# Airline Arrivals

In [1]:
#load library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score


%matplotlib inline

# set seed for reproducibility
np.random.seed(0)

# read in all our data
airline_arrival = pd.read_csv('2008.csv')

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
#Xem 5 dòng dữ liệu đầu tiên
airline_arrival.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,N712SW,128.0,150.0,116.0,-14.0,8.0,IAD,TPA,810,4.0,8.0,0,,0,,,,,
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,N772SW,128.0,145.0,113.0,2.0,19.0,IAD,TPA,810,5.0,10.0,0,,0,,,,,
2,2008,1,3,4,628.0,620,804.0,750,WN,448,N428WN,96.0,90.0,76.0,14.0,8.0,IND,BWI,515,3.0,17.0,0,,0,,,,,
3,2008,1,3,4,926.0,930,1054.0,1100,WN,1746,N612SW,88.0,90.0,78.0,-6.0,-4.0,IND,BWI,515,3.0,7.0,0,,0,,,,,
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,N464WN,90.0,90.0,77.0,34.0,34.0,IND,BWI,515,3.0,10.0,0,,0,2.0,0.0,0.0,0.0,32.0


In [4]:
#Xem 5 dòng dữ liệu cuối cùng
airline_arrival.tail()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
7009723,2008,12,13,6,1002.0,959,1204.0,1150,DL,1636,N646DL,122.0,111.0,71.0,14.0,3.0,ATL,IAD,533,6.0,45.0,0,,0,,,,,
7009724,2008,12,13,6,834.0,835,1021.0,1023,DL,1637,N908DL,167.0,168.0,139.0,-2.0,-1.0,ATL,SAT,874,5.0,23.0,0,,0,,,,,
7009725,2008,12,13,6,655.0,700,856.0,856,DL,1638,N671DN,121.0,116.0,85.0,0.0,-5.0,PBI,ATL,545,24.0,12.0,0,,0,,,,,
7009726,2008,12,13,6,1251.0,1240,1446.0,1437,DL,1639,N646DL,115.0,117.0,89.0,9.0,11.0,IAD,ATL,533,13.0,13.0,0,,0,,,,,
7009727,2008,12,13,6,1110.0,1103,1413.0,1418,DL,1641,N908DL,123.0,135.0,104.0,-5.0,7.0,SAT,ATL,874,8.0,11.0,0,,0,,,,,


In [5]:
#Xem số dòng, số cột của dataframe
airline_arrival.shape

(7009728, 29)

In [6]:
airline_arrival_pos = airline_arrival[airline_arrival['Cancelled'] == 1]
airline_arrival_neutral = airline_arrival[airline_arrival['Cancelled'] == 0].sample(n=1009728, random_state=50)

In [7]:
airline_arrival_pos.shape

(137434, 29)

In [8]:
airline_arrival = airline_arrival_pos.append(airline_arrival_neutral);

In [9]:
airline_arrival.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,1147162.0,2008.0,0.0,2008.0,2008.0,2008.0,2008.0,2008.0
Month,1147162.0,6.302392,3.443536,1.0,3.0,6.0,9.0,12.0
DayofMonth,1147162.0,15.662167,8.778754,1.0,8.0,16.0,23.0,31.0
DayOfWeek,1147162.0,3.916639,1.986166,1.0,2.0,4.0,6.0,7.0
DepTime,1010916.0,1333.401533,478.071983,1.0,927.0,1325.0,1728.0,2400.0
CRSDepTime,1147162.0,1331.175326,463.909683,0.0,930.0,1325.0,1720.0,2359.0
ArrTime,1007674.0,1481.418644,504.837052,1.0,1107.0,1512.0,1909.0,2400.0
CRSArrTime,1147162.0,1499.488217,482.2087,0.0,1117.0,1523.0,1910.0,2400.0
FlightNum,1147162.0,2282.127768,1986.125637,1.0,642.0,1613.0,3665.0,9741.0
ActualElapsedTime,1007210.0,127.343556,70.186991,14.0,77.0,110.0,157.0,837.0


In [10]:
airline_arrival.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],
      dtype='object')

## Data cleaning

In [11]:
# Số lượng điểm data bị thiếu trên mỗi cột
missing_values_count = airline_arrival.isnull().sum()

# Số lượng điểm data bị thiếu trên mỗi cột của 10 cột đầu
missing_values_count

Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime               136246
CRSDepTime                 0
ArrTime               139488
CRSArrTime                 0
UniqueCarrier              0
FlightNum                  0
TailNum                83359
ActualElapsedTime     139952
CRSElapsedTime           349
AirTime               139952
ArrDelay              139952
DepDelay              136246
Origin                     0
Dest                       0
Distance                   0
TaxiIn                139488
TaxiOut               137058
Cancelled                  0
CancellationCode     1009728
Diverted                   0
CarrierDelay          922608
WeatherDelay          922608
NASDelay              922608
SecurityDelay         922608
LateAircraftDelay     922608
dtype: int64

In [12]:
airline_arrival.Cancelled.value_counts()

0    1009728
1     137434
Name: Cancelled, dtype: int64

## Xử lý Missing Value

In [13]:
airline_arrival['DepTime'].fillna((int)(airline_arrival['DepTime'].mean()), inplace=True)

In [14]:
airline_arrival['ArrTime'].fillna((int)(airline_arrival['ArrTime'].mean()), inplace=True)

In [15]:
len(airline_arrival['TailNum'].unique())#.value_counts()

5338

In [16]:
airline_arrival['TailNum'].value_counts().head(1)

N479HA    676
Name: TailNum, dtype: int64

In [17]:
airline_arrival['TailNum'].fillna('N476HA', inplace=True)

In [18]:
airline_arrival['ActualElapsedTime'].fillna((int)(airline_arrival['ActualElapsedTime'].mean()), inplace=True)

In [19]:
airline_arrival['CRSElapsedTime'].fillna((int)(airline_arrival['CRSElapsedTime'].mean()), inplace=True)

In [20]:
airline_arrival['AirTime'].fillna((int)(airline_arrival['AirTime'].mean()), inplace=True)

In [21]:
airline_arrival['ArrDelay'].fillna((int)(airline_arrival['ArrDelay'].mean()), inplace=True)

In [22]:
airline_arrival['DepDelay'].fillna((int)(airline_arrival['DepDelay'].mean()), inplace=True)

In [23]:
airline_arrival['TaxiIn'].fillna((int)(airline_arrival['TaxiIn'].mean()), inplace=True)

In [24]:
airline_arrival['TaxiOut'].fillna((int)(airline_arrival['TaxiOut'].mean()), inplace=True)

In [25]:
airline_arrival.CancellationCode.value_counts()

B    54904
A    54330
C    28188
D       12
Name: CancellationCode, dtype: int64

In [26]:
airline_arrival[airline_arrival['CancellationCode'].isnull()].shape

(1009728, 29)

In [27]:
airline_arrival[airline_arrival['CancellationCode'].notnull()].shape

(137434, 29)

In [28]:
airline_arrival.shape

(1147162, 29)

In [29]:
airline_arrival.drop(['CancellationCode'], axis = 1, inplace = True)

In [30]:
airline_arrival[airline_arrival['CarrierDelay'].notnull()].shape

(224554, 28)

In [31]:
airline_arrival['CarrierDelay'].fillna((int)(airline_arrival['CarrierDelay'].mean()), inplace=True)

In [32]:
airline_arrival[airline_arrival['WeatherDelay'].notnull()].shape
airline_arrival['WeatherDelay'].fillna((int)(airline_arrival['WeatherDelay'].mean()), inplace=True)

In [33]:
airline_arrival[airline_arrival['NASDelay'].notnull()].shape

(224554, 28)

In [34]:
airline_arrival['NASDelay'].fillna((int)(airline_arrival['NASDelay'].mean()), inplace=True)

In [35]:
airline_arrival[airline_arrival['SecurityDelay'].notnull()].shape

(224554, 28)

In [36]:
airline_arrival['SecurityDelay'].fillna((int)(airline_arrival['SecurityDelay'].mean()), inplace=True)

In [37]:
airline_arrival[airline_arrival['LateAircraftDelay'].notnull()].shape

(224554, 28)

In [38]:
airline_arrival['LateAircraftDelay'].fillna((int)(airline_arrival['LateAircraftDelay'].mean()), inplace=True)

In [39]:
# Số lượng điểm data bị thiếu trên mỗi cột
missing_values_count = airline_arrival.isnull().sum()

# Số lượng điểm data bị thiếu trên mỗi cột của 10 cột đầu
missing_values_count

Year                 0
Month                0
DayofMonth           0
DayOfWeek            0
DepTime              0
CRSDepTime           0
ArrTime              0
CRSArrTime           0
UniqueCarrier        0
FlightNum            0
TailNum              0
ActualElapsedTime    0
CRSElapsedTime       0
AirTime              0
ArrDelay             0
DepDelay             0
Origin               0
Dest                 0
Distance             0
TaxiIn               0
TaxiOut              0
Cancelled            0
Diverted             0
CarrierDelay         0
WeatherDelay         0
NASDelay             0
SecurityDelay        0
LateAircraftDelay    0
dtype: int64

In [40]:
airline_arrival.dtypes

Year                   int64
Month                  int64
DayofMonth             int64
DayOfWeek              int64
DepTime              float64
CRSDepTime             int64
ArrTime              float64
CRSArrTime             int64
UniqueCarrier         object
FlightNum              int64
TailNum               object
ActualElapsedTime    float64
CRSElapsedTime       float64
AirTime              float64
ArrDelay             float64
DepDelay             float64
Origin                object
Dest                  object
Distance               int64
TaxiIn               float64
TaxiOut              float64
Cancelled              int64
Diverted               int64
CarrierDelay         float64
WeatherDelay         float64
NASDelay             float64
SecurityDelay        float64
LateAircraftDelay    float64
dtype: object

In [41]:
len(airline_arrival.UniqueCarrier.unique())#value_counts()

20

In [42]:
len(airline_arrival.TailNum.unique())#value_counts()

5337

In [43]:
len(airline_arrival.Origin.unique())#value_counts()

302

In [44]:
len(airline_arrival.Dest.unique())#value_counts()

303

In [45]:
#airline_arrival = pd.get_dummies(airline_arrival, columns=['UniqueCarrier', 'TailNum', 'Origin', 'Dest']);

In [46]:
le = LabelEncoder()

airline_arrival['UniqueCarrier_LabelEncoded'] = le.fit_transform(airline_arrival.UniqueCarrier)

In [47]:
airline_arrival['TailNum_LabelEncoded'] = le.fit_transform(airline_arrival.TailNum)

In [48]:
airline_arrival['Origin_LabelEncoded'] = le.fit_transform(airline_arrival.Origin)

In [49]:
airline_arrival['Dest_LabelEncoded'] = le.fit_transform(airline_arrival.Dest)

In [50]:
airline_arrival.drop(['UniqueCarrier'], axis = 1, inplace = True)
airline_arrival.drop(['TailNum'], axis = 1, inplace = True)
airline_arrival.drop(['Origin'], axis = 1, inplace = True)
airline_arrival.drop(['Dest'], axis = 1, inplace = True)

In [51]:
airline_arrival.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,UniqueCarrier_LabelEncoded,TailNum_LabelEncoded,Origin_LabelEncoded,Dest_LabelEncoded
178,2008,1,3,4,1333.0,700,1481.0,830,126,127.0,90.0,104.0,8.0,9.0,407,6.0,16.0,1,0,15.0,2.0,17.0,0.0,20.0,17,2162,155,204
373,2008,1,3,4,1333.0,1100,1481.0,1215,1146,127.0,75.0,104.0,8.0,9.0,337,6.0,16.0,1,0,15.0,2.0,17.0,0.0,20.0,17,2162,157,204
399,2008,1,3,4,1333.0,905,1481.0,1025,469,127.0,80.0,104.0,8.0,9.0,337,6.0,16.0,1,0,15.0,2.0,17.0,0.0,20.0,17,2162,157,259
401,2008,1,3,4,1333.0,1620,1481.0,1740,618,127.0,80.0,104.0,8.0,9.0,337,6.0,16.0,1,0,15.0,2.0,17.0,0.0,20.0,17,2162,157,259
415,2008,1,3,4,1333.0,1930,1481.0,2035,2528,127.0,65.0,104.0,8.0,9.0,308,6.0,16.0,1,0,15.0,2.0,17.0,0.0,20.0,17,2162,157,264


In [52]:
airline_arrival.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'FlightNum', 'ActualElapsedTime',
       'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Distance',
       'TaxiIn', 'TaxiOut', 'Cancelled', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'UniqueCarrier_LabelEncoded', 'TailNum_LabelEncoded',
       'Origin_LabelEncoded', 'Dest_LabelEncoded'],
      dtype='object')

### Xử lý kiểu Numberic, đưa về cùng 1 scale (mean=0, std=1)

In [53]:
airline_arrival['Year'] = (airline_arrival['Year'] - airline_arrival['Year'].mean())/airline_arrival['Year'].std()
airline_arrival['Month'] = (airline_arrival['Month'] - airline_arrival['Month'].mean())/airline_arrival['Month'].std()
airline_arrival['DayofMonth'] = (airline_arrival['DayofMonth'] - airline_arrival['DayofMonth'].mean())/airline_arrival['DayofMonth'].std()
airline_arrival['DayOfWeek'] = (airline_arrival['DayOfWeek'] - airline_arrival['DayOfWeek'].mean())/airline_arrival['DayOfWeek'].std()
airline_arrival['DepTime'] = (airline_arrival['DepTime'] - airline_arrival['DepTime'].mean())/airline_arrival['DepTime'].std()
airline_arrival['CRSDepTime'] = (airline_arrival['CRSDepTime'] - airline_arrival['CRSDepTime'].mean())/airline_arrival['CRSDepTime'].std()
airline_arrival['ArrTime'] = (airline_arrival['ArrTime'] - airline_arrival['ArrTime'].mean())/airline_arrival['ArrTime'].std()
airline_arrival['CRSArrTime'] = (airline_arrival['CRSArrTime'] - airline_arrival['CRSArrTime'].mean())/airline_arrival['CRSArrTime'].std()
airline_arrival['FlightNum'] = (airline_arrival['FlightNum'] - airline_arrival['FlightNum'].mean())/airline_arrival['FlightNum'].std()
airline_arrival['ActualElapsedTime'] = (airline_arrival['ActualElapsedTime'] - airline_arrival['ActualElapsedTime'].mean())/airline_arrival['ActualElapsedTime'].std()
airline_arrival['CRSElapsedTime'] = (airline_arrival['CRSElapsedTime'] - airline_arrival['CRSElapsedTime'].mean())/airline_arrival['CRSElapsedTime'].std()
airline_arrival['AirTime'] = (airline_arrival['AirTime'] - airline_arrival['AirTime'].mean())/airline_arrival['AirTime'].std()
airline_arrival['ArrDelay'] = (airline_arrival['ArrDelay'] - airline_arrival['ArrDelay'].mean())/airline_arrival['ArrDelay'].std()
airline_arrival['DepDelay'] = (airline_arrival['DepDelay'] - airline_arrival['DepDelay'].mean())/airline_arrival['DepDelay'].std()
airline_arrival['Distance'] = (airline_arrival['Distance'] - airline_arrival['Distance'].mean())/airline_arrival['Distance'].std()
airline_arrival['TaxiIn'] = (airline_arrival['TaxiIn'] - airline_arrival['TaxiIn'].mean())/airline_arrival['TaxiIn'].std()
airline_arrival['TaxiOut'] = (airline_arrival['TaxiOut'] - airline_arrival['TaxiOut'].mean())/airline_arrival['TaxiOut'].std()
airline_arrival['Diverted'] = (airline_arrival['Diverted'] - airline_arrival['Diverted'].mean())/airline_arrival['Diverted'].std()
airline_arrival['CarrierDelay'] = (airline_arrival['CarrierDelay'] - airline_arrival['CarrierDelay'].mean())/airline_arrival['CarrierDelay'].std()
airline_arrival['WeatherDelay'] = (airline_arrival['WeatherDelay'] - airline_arrival['WeatherDelay'].mean())/airline_arrival['WeatherDelay'].std()
airline_arrival['NASDelay'] = (airline_arrival['NASDelay'] - airline_arrival['NASDelay'].mean())/airline_arrival['NASDelay'].std()
airline_arrival['SecurityDelay'] = (airline_arrival['SecurityDelay'] - airline_arrival['SecurityDelay'].mean())/airline_arrival['SecurityDelay'].std()
airline_arrival['LateAircraftDelay'] = (airline_arrival['LateAircraftDelay'] - airline_arrival['LateAircraftDelay'].mean())/airline_arrival['LateAircraftDelay'].std()
airline_arrival['UniqueCarrier_LabelEncoded'] = (airline_arrival['UniqueCarrier_LabelEncoded'] - airline_arrival['UniqueCarrier_LabelEncoded'].mean())/airline_arrival['UniqueCarrier_LabelEncoded'].std()
airline_arrival['TailNum_LabelEncoded'] = (airline_arrival['TailNum_LabelEncoded'] - airline_arrival['TailNum_LabelEncoded'].mean())/airline_arrival['TailNum_LabelEncoded'].std()
airline_arrival['Origin_LabelEncoded'] = (airline_arrival['Origin_LabelEncoded'] - airline_arrival['Origin_LabelEncoded'].mean())/airline_arrival['Origin_LabelEncoded'].std()
airline_arrival['Dest_LabelEncoded'] = (airline_arrival['Dest_LabelEncoded'] - airline_arrival['Dest_LabelEncoded'].mean())/airline_arrival['Dest_LabelEncoded'].std()

In [54]:
len(airline_arrival.Cancelled.unique())

2

In [55]:
airline_arrival.Cancelled.value_counts()

0    1009728
1     137434
Name: Cancelled, dtype: int64

In [56]:
# fig = plt.figure(figsize=(8,6))
# airline_arrival.groupby('Cancelled').Year.count().plot.bar(ylim=0)
# plt.show()

## Model

In [57]:
feature_columns = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'FlightNum', 'ActualElapsedTime',
       'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Distance',
       'TaxiIn', 'TaxiOut', 'Cancelled', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'UniqueCarrier_LabelEncoded', 'TailNum_LabelEncoded',
       'Origin_LabelEncoded', 'Dest_LabelEncoded']
X = airline_arrival[feature_columns]
y = airline_arrival['Cancelled']

In [58]:
X.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,UniqueCarrier_LabelEncoded,TailNum_LabelEncoded,Origin_LabelEncoded,Dest_LabelEncoded
178,-1.539811,-1.442365,0.041971,-0.000788,-1.360556,-0.000777,-1.388379,-1.085595,-0.004587,-0.54673,-0.000144,-0.004016,-0.026474,-0.549458,-0.16379,-0.039502,1,-0.046902,-0.008736,-0.022285,-0.003074,-0.018556,-0.007198,1.000444,-0.360784,0.087034,0.689675
373,-1.539811,-1.442365,0.041971,-0.000788,-0.49832,-0.000777,-0.589969,-0.572032,-0.004587,-0.765911,-0.000144,-0.004016,-0.026474,-0.675865,-0.16379,-0.039502,1,-0.046902,-0.008736,-0.022285,-0.003074,-0.018556,-0.007198,1.000444,-0.360784,0.111935,0.689675
399,-1.539811,-1.442365,0.041971,-0.000788,-0.91866,-0.000777,-0.983989,-0.912897,-0.004587,-0.692851,-0.000144,-0.004016,-0.026474,-0.675865,-0.16379,-0.039502,1,-0.046902,-0.008736,-0.022285,-0.003074,-0.018556,-0.007198,1.000444,-0.360784,0.111935,1.37124
401,-1.539811,-1.442365,0.041971,-0.000788,0.622588,-0.000777,0.498771,-0.837876,-0.004587,-0.692851,-0.000144,-0.004016,-0.026474,-0.675865,-0.16379,-0.039502,1,-0.046902,-0.008736,-0.022285,-0.003074,-0.018556,-0.007198,1.000444,-0.360784,0.111935,1.37124
415,-1.539811,-1.442365,0.041971,-0.000788,1.290822,-0.000777,1.110539,0.123795,-0.004587,-0.912032,-0.000144,-0.004016,-0.026474,-0.728234,-0.16379,-0.039502,1,-0.046902,-0.008736,-0.022285,-0.003074,-0.018556,-0.007198,1.000444,-0.360784,0.111935,1.4332


# LogisticRegression

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
#We call our estimator instance clf, as it is a classifier.
clf = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
# MultinomialNB_accuracy = accuracy_score(y_test, y_pred)*100

print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00    252736
          1       1.00      1.00      1.00     34055

avg / total       1.00      1.00      1.00    286791



# SVM

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
#We call our estimator instance clf, as it is a classifier.
clf = LinearSVC().fit(X_train, y_train)
y_pred = clf.predict(X_test)
# MultinomialNB_accuracy = accuracy_score(y_test, y_pred)*100

print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00    252736
          1       1.00      1.00      1.00     34055

avg / total       1.00      1.00      1.00    286791



# Random Forest Classifier

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
#We call our estimator instance clf, as it is a classifier.
clf = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
# MultinomialNB_accuracy = accuracy_score(y_test, y_pred)*100

print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00    252736
          1       1.00      1.00      1.00     34055

avg / total       1.00      1.00      1.00    286791



# Decision Tree

In [62]:
from sklearn import tree
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
#We call our estimator instance clf, as it is a classifier.
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# MultinomialNB_accuracy = accuracy_score(y_test, y_pred)*100

print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00    252736
          1       1.00      1.00      1.00     34055

avg / total       1.00      1.00      1.00    286791

