In [None]:
import numpy as np 
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        file_path = os.path.join(folder, name)
        print(file_path)

In [None]:
import pandas as pd
# Read the dataset CSV into a DataFrame and preview the first rows.
dataset_path = '/kaggle/input/flight-price-prediction/Clean_Dataset.csv'
df = pd.read_csv(dataset_path)
df.head()

In [None]:
# Count the missing values in each column.
missing_per_column = df.isnull().sum()
missing_per_column

In [None]:
# Number of rows per airline.
df['airline'].value_counts()

In [None]:
# Number of rows per source city.
df['source_city'].value_counts()

In [None]:
# Number of rows per destination city.
df['destination_city'].value_counts()

In [None]:
# Number of rows per departure-time category.
df['departure_time'].value_counts()

In [None]:
# Number of rows per arrival-time category.
df['arrival_time'].value_counts()

In [None]:
# Number of rows per distinct value of the stops column.
df['stops'].value_counts()

In [None]:
# Number of rows per distinct value of the class column.
class_counts = df['class'].value_counts()
class_counts

In [None]:
# Frequency of each distinct duration value.
df.duration.value_counts()

In [None]:
# Summary statistics for the duration column, printed as plain text.
duration_summary = df['duration'].describe()
print(duration_summary)


**Preprocessing**
* drop the `Unnamed: 0` and `flight` columns
* one-hot encode airline, source & destination city, arrival & departure time
* turn stops into integer codes (0, 1, 2)
* turn class into 0/1 (1 = Business)

In [None]:
# Remove the 'Unnamed: 0' and 'flight' columns in a single call.
df = df.drop(columns=['Unnamed: 0', 'flight'])

# Encode class as a binary flag: 1 for 'Business', 0 for any other value.
df['class'] = df['class'].apply(lambda cabin: int(cabin == 'Business'))

In [None]:
# Replace each distinct stops label with an integer code. Without sort=True,
# pd.factorize numbers the labels in order of first appearance in the column.
stop_codes, _stop_labels = pd.factorize(df['stops'])
df['stops'] = stop_codes

In [None]:
# One-hot encode each categorical column: append its dummy columns (named with
# the given prefix) to the frame, then remove the original column.
categorical_prefixes = [
    ('airline', 'airline'),
    ('source_city', 'source'),
    ('destination_city', 'dest'),
    ('arrival_time', 'arrival'),
    ('departure_time', 'departure'),
]
for column, prefix in categorical_prefixes:
    dummies = pd.get_dummies(df[column], prefix=prefix)
    df = df.join(dummies).drop(columns=column)

In [None]:
# Display the frame after the drop/encode steps above to check the resulting columns.
df

**Training Regression Model**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# The target is the price column; the features are every other column.
y = df['price']
X = df.drop(columns='price')

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Baseline random forest with default hyperparameters.
# random_state pins the forest's randomness so the fit is repeatable;
# n_jobs=-1 builds the trees on all available cores.
reg = RandomForestRegressor(random_state=42, n_jobs=-1)

reg.fit(X_train, y_train)

In [None]:
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# import absolute error to know how much we deviate from actual price

# Predict on the held-out test set with the baseline model.
y_pred = reg.predict(X_test)

# Compute the MSE once and reuse it for the RMSE.
mse = mean_squared_error(y_test, y_pred)

print('R2:', r2_score(y_test, y_pred))
print('MAE:', mean_absolute_error(y_test, y_pred))  # label was misspelled 'MEA'
print('MSE:', mse)
print('RMSE:', math.sqrt(mse))

In [None]:
import matplotlib.pyplot as plt

# Actual prices go on the x-axis and model predictions on the y-axis.
plt.scatter(y_test, y_pred)
plt.xlabel('Actual Flight Price')
# This was a second xlabel call, which overwrote the x label and left y unlabeled.
plt.ylabel('Predicted Flight Price')
plt.title('Actual vs Predicted Flight Prices')

In [None]:
# Summary statistics of the target, for judging the size of the errors above.
df['price'].describe()

**Refining the model**

Importance of features

In [None]:
# Pair each feature name seen at fit time with its importance score.
feature_names = reg.feature_names_in_
feature_scores = reg.feature_importances_
importances = {name: score for name, score in zip(feature_names, feature_scores)}

# Order the (name, score) pairs from most to least important.
sorted_importances = sorted(
    importances.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

sorted_importances

In [None]:
# Summary statistics for the days_left column.
df['days_left'].describe()

In [None]:
# Bar chart of the ten highest-ranked features and their importance scores.
top_features = sorted_importances[:10]
bar_labels = [name for name, score in top_features]
bar_heights = [score for name, score in top_features]

plt.figure(figsize=(15, 10))
plt.bar(bar_labels, bar_heights)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor

# Search space for the randomized hyperparameter search.
# scipy's randint(low, high) samples integers from [low, high).
param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5),
    'max_features': [1.0, 'sqrt'],
}

# Seed the forest as well as the search: the search's random_state only fixes
# which candidates are sampled, not the randomness inside each forest.
reg = RandomForestRegressor(n_jobs=-1, random_state=42)

# Only n_iter=2 candidates are sampled, each scored with 3-fold CV on negative MSE.
random_search = RandomizedSearchCV(
    estimator=reg,
    param_distributions=param_dist,
    n_iter=2,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
    random_state=10,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Estimator refit with the best-scoring candidate's parameters.
best_regressor = random_search.best_estimator_

In [None]:
# Score of the tuned model on the held-out test set (R^2 for sklearn regressors).
test_r2 = best_regressor.score(X_test, y_test)
test_r2

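Test-set R² ≈ 0.98655, i.e. the model explains about 98.655% of the variance in price (this is an R² score, not a classification accuracy).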

In [None]:
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# import absolute error to know how much we deviate from actual price

# Predict on the held-out test set with the tuned model.
y_pred = best_regressor.predict(X_test)

# Compute the MSE once and reuse it for the RMSE.
mse = mean_squared_error(y_test, y_pred)

print('R2:', r2_score(y_test, y_pred))
print('MAE:', mean_absolute_error(y_test, y_pred))  # label was misspelled 'MEA'
print('MSE:', mse)
print('RMSE:', math.sqrt(mse))

In [None]:
import matplotlib.pyplot as plt

# Actual prices go on the x-axis and the tuned model's predictions on the y-axis.
plt.scatter(y_test, y_pred)
plt.xlabel('Actual Flight Price')
# This was a second xlabel call, which overwrote the x label and left y unlabeled.
plt.ylabel('Predicted Flight Price')
plt.title('Actual vs Predicted Flight Prices')