In [1]:
# Import relevant packages
import os
import tarfile
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

# Check where the current operating system directory is
print(os.getcwd())

C:\Users\liyan\Documents\MSc Business Analytics\Final Project


In [2]:
# Update the new path to the relevant pathway and check which files are there
path = "C:/Users/liyan\Documents/MSc Business Analytics/Final Project/"
os.chdir( path )
print(os.getcwd())
os.listdir(os.getcwd())

C:\Users\liyan\Documents\MSc Business Analytics\Final Project


['.ipynb_checkpoints',
 'GeneralCommentsOnSabreProject1.docx',
 'Insurance attempt 1.ipynb',
 'ml_project1_20190418.csv',
 'Notes from Initial meeting 07052019.docx',
 'Sabre NDA - Nisal.pdf',
 '~$tes from Initial meeting 07052019.docx']

In [3]:
# Get the training data  
insurance_train = pd.read_csv('ml_project1_20190418.csv')

In [4]:
# Check the data types and number of entries for each feature for the training set. 
# As we can see, there are no missing entries
insurance_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1795672 entries, 0 to 1795671
Data columns (total 89 columns):
uniqueID    int64
Var01       object
Var03       object
Var07       int64
Var10       float64
Var11       object
Var13       object
Var14       object
Var19       object
Var20       object
Var21       object
Var23       object
Var25       object
Var26       float64
Var27       float64
Var28       float64
Var29       float64
Var30       int64
Var31       int64
Var32       int64
Var33       int64
Var34       int64
Var35       int64
Var42       int64
pred_s      float64
Var47       float64
Var48       object
Var49       object
Var50       int64
Var51       object
Var52       object
Var53       object
Var55       object
c01         object
c02         object
c03         object
c06         object
c07         object
c08         float64
c09         float64
c10         float64
c11         float64
c12         float64
c13         object
c14         object
c15         float64
c17       

In [5]:
# Look at some of the unique values within each variable
for col in list(insurance_train.columns):
    
    # get a list of unique values
    uniques = insurance_train[col].unique()
    
    # if number of unique values is less than 30, print the values. Otherwise print the number of unique values
    if len(uniques)<30:
        print(col + ':')
        print(uniques)
    else:
        print(col + ': ' +str(len(uniques)) + ' unique values')

uniqueID: 849534 unique values
Var01:
['A' 'B' 'C']
Var03:
['A' 'B' 'Q' 'G' 'H' 'C' 'D' 'F' 'E' 'L' 'N' 'Z' 'I' 'P' 'J']
Var07:
[0 1 4 3 2 5]
Var10:
[ 2.  5.  4.  6.  7.  8.  9. 10. 12. 11.  1.  3. 14. 15. 13. -1. 17. 16.
 nan]
Var11:
['B' 'A' 'E' 'D' 'C' 'G' 'F' 'Z']
Var13:
['A' 'E' 'I' 'F' 'B' 'D' 'L' 'C' 'G' 'J' 'N' 'K' 'M' 'Q' 'P' 'Z' 'H' 'O'
 'S' 'R']
Var14:
['A' 'B' 'C' 'E' 'D' 'Z' 'F']
Var19:
['C' 'A' 'F' 'B' 'E' 'D' 'G' 'Z']
Var20:
['B' 'A']
Var21:
['A' 'D' 'C' 'B' 'Z']
Var23:
['A' 'Z' 'C' 'B']
Var25:
['F' 'G' 'B' 'C' 'A' 'K' 'M' 'J' 'I' 'H' 'E' 'L' 'D' 'Z']
Var26:
[15. 13.  8.  6. 11. 18. 10.  7. 14. 17.  2. 19.  9. 12.  5.  1.  3. 20.
 16.  4. nan]
Var27:
[16.  8.  5.  1. 13.  9.  6.  3. 15.  2.  4. 17. 10. 12. 14. -1.  7. 11.
 nan]
Var28:
[ 9.  6. 12. 10.  8.  4.  7.  1. 11.  2.  5. 15. 14. 13.  3. nan]
Var29:
[10.  8.  4.  2.  1. 14.  6.  3. 16.  5. 12.  9. -1. 11. 13.  7. 15. 17.
 nan]
Var30:
[0 1 3 2]
Var31:
[0 1 2 3]
Var32:
[0 1 3 2]
Var33:
[0 1 2 3]
Var34:
[0 2 1 3]
Var

In [6]:
## training set - we drop 'train' column after this
data_train = insurance_train.loc[insurance_train.train=="Y"]
data_train.drop(['train'],axis=1,inplace=True)
data_train.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


(1647565, 88)

In [7]:
## test set - we drop 'train' column after this
data_test = insurance_train.loc[insurance_train.train=="N"]
data_test.drop(['train'],axis=1,inplace=True)
data_test.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


(148107, 88)

In [8]:
total = data_train.isnull().sum()
pct = total/data_train.isnull().count()
NAs = pd.concat([total,pct],axis=1,keys=('Total','Pct'))
NAs[NAs.Total>0].sort_values(by='Total',ascending=False)

Unnamed: 0,Total,Pct
c04,664123,0.403094
c39,437060,0.265276
c40,432604,0.262572
c38,422082,0.256185
c16,259121,0.157275
c05,252524,0.153271
id,163333,0.099136
c18,157811,0.095784
c15,155381,0.094309
c08,154676,0.093882


In [9]:
# Need to do something else for pred_s
for column in data_train:
    if data_train[column].isnull().sum()>0:
        data_train[column].fillna(data_train[column].max()+1, inplace=True)
    else: 
        pass

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [None]:
total = data_train.isnull().sum()
pct = total/data_train.isnull().count()
NAs = pd.concat([total,pct],axis=1,keys=('Total','Pct'))
NAs[NAs.Total>0].sort_values(by='Total',ascending=False)

In [10]:
# Split numeric and categorical features for training set
cat_train_feats = data_train.select_dtypes(include=['object']).columns
num_train_feats = data_train.select_dtypes(include=['float64','int64']).columns
num_train_df = data_train[num_train_feats]
cat_train_df = data_train[cat_train_feats]
print('There are %d numeric features and %d categorical features in the training dataset\n' %(len(num_train_feats),len(cat_train_feats)))
print('Numeric features:\n',num_train_feats.values)
print('Categorical features:\n',cat_train_feats.values)

There are 49 numeric features and 39 categorical features in the training dataset

Numeric features:
 ['uniqueID' 'Var07' 'Var10' 'Var26' 'Var27' 'Var28' 'Var29' 'Var30'
 'Var31' 'Var32' 'Var33' 'Var34' 'Var35' 'Var42' 'pred_s' 'Var47' 'Var50'
 'c08' 'c09' 'c10' 'c11' 'c12' 'c15' 'c18' 'c19' 'c27' 'Var54' 'Var02'
 'Var04' 'Var05' 'Var06' 'Var08' 'Var09' 'Var12' 'Var15' 'Var16' 'Var18'
 'Var22' 'Var24' 'c04' 'c05' 'c16' 'c26' 'c34' 'c35' 'c38' 'c39' 'c40'
 'id']
Categorical features:
 ['Var01' 'Var03' 'Var11' 'Var13' 'Var14' 'Var19' 'Var20' 'Var21' 'Var23'
 'Var25' 'Var48' 'Var49' 'Var51' 'Var52' 'Var53' 'Var55' 'c01' 'c02' 'c03'
 'c06' 'c07' 'c13' 'c14' 'c17' 'c20' 'c21' 'c22' 'c23' 'c24' 'c25' 'c28'
 'c29' 'c30' 'c31' 'c32' 'c33' 'c36' 'c37' 'Var56']


In [11]:
# One hot encoding to make categorical features numerical for analysis purposes
cat_train_df = pd.get_dummies(cat_train_df)
cat_train_df.head()

Unnamed: 0,Var01_A,Var01_B,Var01_C,Var03_A,Var03_B,Var03_C,Var03_D,Var03_E,Var03_F,Var03_G,...,c32_C,c33_A,c33_B,c33_C,c36_N,c36_Y,c37_N,c37_Y,Var56_A,Var56_B
0,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,1,0
1,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,1,0
2,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,1,0
3,1,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,1,0
4,0,1,0,1,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,1,0


In [12]:
# Combine the numerical and categorical features for total dataset represented numerically
all_train = pd.concat([num_train_df,cat_train_df],axis=1)
print(all_train.shape)

(1647565, 277)


In [13]:
train_label=all_train['Var47']

In [14]:
# Drop the training label and the 'duration' variables from the training dataset
all_train.drop(['Var47', 'uniqueID'],axis=1,inplace=True)

In [15]:
# Create the validation data from the training set because original test set had no labels, and split training set.
from sklearn.model_selection import train_test_split, cross_val_score
train_df, val_df, y_train, y_val = train_test_split(all_train,train_label,test_size = 0.2,random_state=42)

print(train_df.shape)
print(y_train.shape)
print(val_df.shape)
print(y_val.shape)

(1318052, 275)
(1318052,)
(329513, 275)
(329513,)


In [17]:
# Train another model using entire training dataset
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, cross_val_score
log_reg=LogisticRegression(random_state = 42)
log_reg.fit(train_df, y_train)

MemoryError: 

In [None]:
train_df.info()

In [None]:
df = {'Country': ['Belgium', 'India', 'Brazil', 'China'], 
     'Capital': [1, 2, 3,np.nan],
     'Population': [3,5,60,np.nan ]}
df1 = pd.DataFrame(df, columns=['Country', 'Capital', 'Population'])

In [None]:
df1

In [None]:
for column in df1:
    if df1[column].isnull().sum()>0:
        df1[column].fillna(df1[column].max()+1, inplace=True)
    else: 
        pass

In [None]:
df1