In [0]:
# To download files
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# (Drive file id, local filename) pairs for every input file we need.
drive_files = [
    ('1J8TOqICEwGDov5ymJUomeiPRn2FVZwwe', 'train.csv'),
    ('1kP7inhf8t2hPvRlTxyU56nI3DRtmUwDQ', 'test.csv'),
    ('1yP-mQHvkkncFAV_Mxyyna05ZyC0GzPKe', 'sample_submission.csv'),
]

# Fetch each Drive file and store its content under the given local name.
for file_id, local_name in drive_files:
    downloaded = drive.CreateFile({'id': file_id})
    downloaded.GetContentFile(local_name)

In [0]:
import pandas as pd
import numpy as np

In [0]:
# Read the three downloaded CSV files into DataFrames (same order as the
# names on the left-hand side).
training_data, testing_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [4]:
# Preview the first rows of the training frame as loaded from train.csv.
training_data.head()

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


In [0]:
# The id column is only needed when building the submission file, so keep the
# test ids aside and remove the column from both frames.
submission_ids = testing_data['id'] # used for submission later
# Use the `columns=` keyword: passing the axis positionally, as in
# drop('id', 1), is deprecated and raises TypeError on pandas 2.0 and later.
training_data = training_data.drop(columns='id')
testing_data = testing_data.drop(columns='id')

In [6]:
# this is to check the possible values in the categorical features
s = set()
for feat in training_data.columns:
  if 'cat' in feat:
    s.update(training_data[feat].unique())
s

{'A',
 'AA',
 'AB',
 'AC',
 'AD',
 'AE',
 'AF',
 'AG',
 'AH',
 'AI',
 'AJ',
 'AK',
 'AL',
 'AM',
 'AN',
 'AO',
 'AP',
 'AQ',
 'AR',
 'AS',
 'AT',
 'AU',
 'AV',
 'AW',
 'AX',
 'AY',
 'B',
 'BA',
 'BB',
 'BC',
 'BD',
 'BE',
 'BF',
 'BG',
 'BH',
 'BI',
 'BJ',
 'BK',
 'BL',
 'BM',
 'BN',
 'BO',
 'BP',
 'BQ',
 'BR',
 'BS',
 'BT',
 'BU',
 'BV',
 'BW',
 'BX',
 'BY',
 'C',
 'CA',
 'CB',
 'CC',
 'CD',
 'CE',
 'CF',
 'CG',
 'CH',
 'CI',
 'CJ',
 'CK',
 'CL',
 'CM',
 'CN',
 'CO',
 'CP',
 'CQ',
 'CR',
 'CS',
 'CT',
 'CU',
 'CV',
 'CW',
 'CX',
 'CY',
 'D',
 'DA',
 'DB',
 'DC',
 'DD',
 'DE',
 'DF',
 'DG',
 'DH',
 'DI',
 'DJ',
 'DK',
 'DL',
 'DM',
 'DN',
 'DO',
 'DP',
 'DQ',
 'DR',
 'DS',
 'DT',
 'DU',
 'DV',
 'DW',
 'DX',
 'DY',
 'E',
 'EA',
 'EB',
 'EC',
 'ED',
 'EE',
 'EF',
 'EG',
 'EH',
 'EI',
 'EJ',
 'EK',
 'EL',
 'EM',
 'EN',
 'EO',
 'EP',
 'EQ',
 'ES',
 'EU',
 'EV',
 'EW',
 'EY',
 'F',
 'FA',
 'FB',
 'FC',
 'FD',
 'FE',
 'FF',
 'FG',
 'FH',
 'FI',
 'FJ',
 'FK',
 'FL',
 'FM',
 'FN',
 'FO',
 'FP'

In [0]:
def encode(c):
    """Convert a letter label into an integer, spreadsheet-column style.

    The argument is passed through ``str()`` and its characters are read as
    digits of a base-26 number in which 'A' is 1 and 'Z' is 26.

    Examples:
        encode('A')  -> 1
        encode('Z')  -> 26
        encode('AA') -> 27

    Characters are not validated; anything outside 'A'-'Z' is still folded
    in using its code point relative to 'A'.
    """
    base_offset = ord('A') - 1
    total = 0
    # Horner's scheme: shift the running value one base-26 place, then add
    # the value of the next character.
    for letter in str(c):
        total = total * 26 + (ord(letter) - base_offset)
    return total

In [2]:
# Spot-check a two-letter label: 'ZZ' -> 26 * 26 + 26 = 702.
encode('ZZ')

702

In [0]:
# Encode every categorical column (name contains 'cat') of both frames with
# encode().
for feat in training_data.columns:
  if 'cat' not in feat:
    continue
  for frame in (training_data, testing_data):
    # Skip columns that are already numeric so that re-running this cell is
    # harmless: feeding the integer codes back into encode() would silently
    # turn them into garbage (e.g. encode(1) == -15).
    if not pd.api.types.is_numeric_dtype(frame[feat]):
      frame[feat] = frame[feat].apply(encode)

In [9]:
# Preview the training frame again now that the 'cat' columns hold integers.
training_data.head()

Unnamed: 0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,2,1,2,1,1,1,1,2,1,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,1,2,1,1,1,1,1,1,2,2,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,1,2,1,1,2,1,1,1,2,2,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,2,2,1,2,1,1,1,1,2,1,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,1,2,1,2,1,1,1,1,2,2,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


In [0]:
# convert dataframe to csv file
training_data.to_csv('train_clean.csv', index=False)
testing_data.to_csv('test_clean.csv', index=False)

In [0]:
# Send the cleaned CSV files from the Colab runtime to the local machine.
from google.colab import files
for csv_name in ('train_clean.csv', 'test_clean.csv'):
    files.download(csv_name)