# Intalling Libraries

In [80]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/aviachipta-narxini-bashorat-qilish/train_data.csv
/kaggle/input/aviachipta-narxini-bashorat-qilish/test_data.csv
/kaggle/input/aviachipta-narxini-bashorat-qilish/sample_solution.csv


**Data Uploading**

In [None]:
# Uploading data
train_data = pd.read_csv("/kaggle/input/aviachipta-narxini-bashorat-qilish/train_data.csv",index_col=0)
test_data = pd.read_csv('/kaggle/input/aviachipta-narxini-bashorat-qilish/test_data.csv')
train_data.head()

# Data Preparation

In [84]:
# Testset and Trainset
train_set, test_set = train_test_split(train_data, test_size=0.2, random_state=42)

# Our label(Y) is 'price' column. So we'll seperate X and Y
X_train = train_set.drop(['price','flight'],axis=1) # We've dropped 'flight' col. Because I has low corr and makes the encoding complex
Y = train_set['price'].copy()

In [107]:
#X_train.info()
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16000 entries, 5895 to 15796
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   airline           16000 non-null  object 
 1   source_city       16000 non-null  object 
 2   departure_time    16000 non-null  object 
 3   stops             16000 non-null  object 
 4   arrival_time      16000 non-null  object 
 5   destination_city  16000 non-null  object 
 6   class             16000 non-null  object 
 7   duration          16000 non-null  float64
 8   days_left         16000 non-null  int64  
dtypes: float64(1), int64(1), object(7)
memory usage: 1.2+ MB


In [86]:
# Creating encoder function
attribs_dic = {'airline':{'Vistara':0,'Air_India':1,'Indigo':2,'GO_FIRST':3,'AirAsia':4,'SpiceJet':5},
               'source_city':{'Mumbai':0,'Delhi':1,'Bangalore':2,'Kolkata':3,'Hyderabad':4,'Chennai':5},
               'destination_city':{'Mumbai':0,'Delhi':1,'Bangalore':2,'Kolkata':3,'Hyderabad':4,'Chennai':5},
               'departure_time':{'Morning':0,'Early_Morning':1,'Evening':2,'Night':3,'Afternoon':4,'Late_Night':5},
               'arrival_time':{'Morning':0,'Early_Morning':1,'Evening':2,'Night':3,'Afternoon':4,'Late_Night':5},
               'stops':{'zero':0,'one':1,'two_or_more':2},
               'class':{'Economy':0,'Business':1}
}

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self # Our func is not fitter but transformer
    def transform(self, X):
        encoded = X.replace(attribs_dic)
        return encoded

In [87]:
# Creating pipelines for numerical(standaritazion) and categorical(encoding)

# combine numerical and categorical columns seperately
cat_attribs = ['airline','source_city','departure_time','stops','arrival_time','destination_city','class']
num_attribs = ['duration','days_left']

# pipeline for numerical ones
num_pipeline = Pipeline([('std_scale',StandardScaler())])

# full pipeline
full_pipeline = ColumnTransformer([('num',num_pipeline,num_attribs),
                                  ('cat',CategoricalEncoder(),cat_attribs)
])


# Training and Testing

**Training(Random Forest)**

In [88]:
#Putting all data to the full_pipeline
X_prepared = full_pipeline.fit_transform(X_train)

In [89]:
# Training
RF_model = RandomForestRegressor()
RF_model.fit(X_prepared, Y)

**Testing**

In [90]:
#Preparing test set
X_test = test_set.drop(['price','flight'],axis=1)  # We've dropped 'flight' col. Because I has low corr and makes the encoding complex
Y_test = test_set['price'].copy()

In [91]:
# Putting in pipeline
X_test_prepared = full_pipeline.fit_transform(X_test)

In [92]:
#  Prediction
y_predict_RF = RF_model.predict(X_test_prepared)

In [94]:
# Checking Error
mae = mean_absolute_error(Y_test, y_predict_RF)
print('MAE = ',mae)

MAE =  2038.453156875


# Predicting the price for the test_data (Answer)

In [110]:
df = test_data.drop('flight',axis=1)  # We've dropped 'flight' col. Because I has low corr and makes the encoding complex

In [111]:
#Preparing for ML
df_prep = full_pipeline.fit_transform(df)

In [112]:
#Prediction
price_pred = RF_model.predict(df_prep)

In [118]:
#Answer
answer = pd.DataFrame({'id':df['id'],'price':price_pred})
answer

Unnamed: 0,id,price
0,1,53552.92
1,2,53700.86
2,3,23133.60
3,4,2410.50
4,5,6097.11
...,...,...
4995,4996,58981.31
4996,4997,4395.00
4997,4998,49764.57
4998,4999,48443.18
