In [6]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
import operator

In [2]:
train_fname = 'data/train.csv'
test_fname = 'data/test.csv'
df = pd.read_csv(train_fname, sep=';')
df_test = pd.read_csv(test_fname, sep=';')
(n_samples,n_variables) = df.shape

In [3]:
f_con = ['APP_NB','APP_NB_PAYS','APP_NB_TYPE','NB_CLASSES','NB_ROOT_CLASSES','NB_SECTORS','NB_FIELDS','INV_NB',
        'INV_NB_PAYS','INV_NB_TYPE','cited_n','cited_age_min','cited_age_median','cited_age_max','cited_age_mean',
        'cited_age_std','NB_BACKWARD_NPL','NB_BACKWARD_XY','NB_BACKWARD_I','NB_BACKWARD_AUTRE','NB_BACKWARD_PL',
        'NB_BACKWARD','pct_NB_IPC','pct_NB_IPC_LY','oecd_NB_ROOT_CLASSES','oecd_NB_BACKWARD_PL','oecd_NB_BACKWARD_NPL',
        'IDX_ORIGIN','IDX_RADIC','PRIORITY_MONTH','FILING_MONTH','PUBLICATION_MONTH','BEGIN_MONTH']
f_cat = ['VOIE_DEPOT','COUNTRY','SOURCE_BEGIN_MONTH','FISRT_APP_COUNTRY','FISRT_APP_TYPE','LANGUAGE_OF_FILLING',
        'FIRST_CLASSE','TECHNOLOGIE_SECTOR','TECHNOLOGIE_FIELD','MAIN_IPC','FISRT_INV_COUNTRY','FISRT_INV_TYPE','SOURCE_CITED_AGE',
        'SOURCE_IDX_ORI','SOURCE_IDX_RAD']

## 1) Données continues.

In [4]:
X_train_con = df[f_con].values
X_test_con = df_test[f_con].values

In [7]:
for i in range(n_samples):
    for j in range(29,33):
        if(isinstance(X_train_con[i][j],six.string_types)):
            str = X_train_con[i][j].split('/')
            X_train_con[i][j] = int(str[0]) + 12*int(str[1])

for i in range(X_test_con.shape[0]):
    for j in range(29,33):
        if(isinstance(X_test_con[i][j],six.string_types)):
            str = X_test_con[i][j].split('/')
            X_test_con[i][j] = int(str[0]) + 12*int(str[1])

In [8]:
imputer = Imputer()
X_train_con = imputer.fit_transform(X_train_con)
X_test_con = imputer.fit_transform(X_test_con)

In [9]:
scale(X_train_con, copy=False);
scale(X_test_con, copy=False);

## 2) Données catégorielles

In [14]:
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [21]:
X_train_cat = df[f_cat].values
X_test_cat = df_test[f_cat].values

for i in range(n_samples):
    for j in range(len(f_cat)):
        if not isinstance(X_train_cat[i][j], six.string_types):
            X_train_cat[i][j] = '(MISSING)'
            
for i in range(X_test_cat.shape[0]):
    for j in range(len(f_cat)):
        if not isinstance(X_test_cat[i][j], six.string_types):
            X_test_cat[i][j] = '(MISSING)'

In [22]:
encoder = MultiColumnLabelEncoder()
X_cat = encoder.fit_transform(pd.DataFrame(np.concatenate([X_train_cat,X_test_cat], axis=0),columns=f_cat)).values

In [23]:
for i in range(len(f_cat)):
    print("{} : {} {}".format(i,f_cat[i],np.max(X_cat[:,i])))

0 : VOIE_DEPOT 1
1 : COUNTRY 94
2 : SOURCE_BEGIN_MONTH 2
3 : FISRT_APP_COUNTRY 143
4 : FISRT_APP_TYPE 4
5 : LANGUAGE_OF_FILLING 29
6 : FIRST_CLASSE 41178
7 : TECHNOLOGIE_SECTOR 4
8 : TECHNOLOGIE_FIELD 34
9 : MAIN_IPC 629
10 : FISRT_INV_COUNTRY 150
11 : FISRT_INV_TYPE 4
12 : SOURCE_CITED_AGE 1
13 : SOURCE_IDX_ORI 1
14 : SOURCE_IDX_RAD 1


In [12]:
X_train_cat = X_cat[0:n_samples,:]
X_test_cat = X_cat[n_samples:,:]

In [15]:
X_train = np.concatenate((X_train_con,X_train_cat), axis=1)
X_test = np.concatenate((X_test_con,X_test_cat), axis=1)

pd.DataFrame(data=X_train, columns=f_con + f_cat).to_csv(path_or_buf='../data/train_1.csv', sep=';')
pd.DataFrame(data=X_test, columns=f_con + f_cat).to_csv(path_or_buf='../data/test_1.csv', sep=';')

#### Sans FIRST_CLASSE :

In [13]:
f_cat_2 = ['VOIE_DEPOT','COUNTRY','SOURCE_BEGIN_MONTH','FISRT_APP_COUNTRY','FISRT_APP_TYPE','LANGUAGE_OF_FILLING',
        'TECHNOLOGIE_SECTOR','TECHNOLOGIE_FIELD','MAIN_IPC','FISRT_INV_COUNTRY','FISRT_INV_TYPE','SOURCE_CITED_AGE',
        'SOURCE_IDX_ORI','SOURCE_IDX_RAD']

In [14]:
X_train_cat_2 = df[f_cat_2].values
X_test_cat_2 = df_test[f_cat_2].values

In [15]:
encoder = MultiColumnLabelEncoder()
X_cat = encoder.fit_transform(pd.DataFrame(np.concatenate([X_train_cat_2,X_test_cat_2], axis=0),columns=f_cat_2)).values

In [16]:
for i in range(len(f_cat_2)):
    print("{} : {} {}".format(i,f_cat[i],np.max(X_cat[:,i])))

0 : VOIE_DEPOT 1
1 : COUNTRY 94
2 : SOURCE_BEGIN_MONTH 2
3 : FISRT_APP_COUNTRY 143
4 : FISRT_APP_TYPE 4
5 : LANGUAGE_OF_FILLING 29
6 : FIRST_CLASSE 4
7 : TECHNOLOGIE_SECTOR 34
8 : TECHNOLOGIE_FIELD 629
9 : MAIN_IPC 150
10 : FISRT_INV_COUNTRY 4
11 : FISRT_INV_TYPE 1
12 : SOURCE_CITED_AGE 1
13 : SOURCE_IDX_ORI 1


In [19]:
X_train_cat_2 = X_cat[0:n_samples,:]
X_test_cat_2 = X_cat[n_samples:,:]

In [21]:
X_train = np.concatenate((X_train_con,X_train_cat_2), axis=1)
X_test = np.concatenate((X_test_con,X_test_cat_2), axis=1)

pd.DataFrame(data=X_train, columns=f_con + f_cat_2).to_csv(path_or_buf='../data/train_2.csv', sep=';')
pd.DataFrame(data=X_test, columns=f_con + f_cat_2).to_csv(path_or_buf='../data/test_2.csv', sep=';')

#### FIRST_CLASSE tronquée :

In [24]:
X_train_cat = df[f_cat].values
X_test_cat = df_test[f_cat].values

In [25]:
cnt = defaultdict(lambda: 0)

In [26]:
for i in range(X_train_cat.shape[0]):
    cnt[X_train_cat[i][6]] += 1
for i in range(X_test_cat.shape[0]):
    cnt[X_test_cat[i][6]] += 1

In [28]:
stuff = dict(sorted(cnt.items(), key=operator.itemgetter(1), reverse=True)[:10000])

In [29]:
for i in range(X_train_cat.shape[0]):
    if(not X_train_cat[i][6] in stuff):
        X_train_cat[i][6] = '(MISSING)'
for i in range(X_test_cat.shape[0]):
    if(not X_test_cat[i][6] in stuff):
        X_test_cat[i][6] = '(MISSING)'

In [30]:
print(X_train_cat[0:20,6])

['A61K9/48' 'C08G65/26' '(MISSING)' 'F25B41/04' 'H01F17/06' 'A01N25/34'
 'H04N7/24' '(MISSING)' 'H05K3/34' 'A61L27/00' 'G06F17/60' 'H04N13/00'
 '(MISSING)' 'A61K38/16' 'G02B6/44' 'B30B9/30' 'G01R33/3415' 'B63B21/50'
 'H01F41/02' '(MISSING)']


In [32]:
for i in range(n_samples):
    for j in range(len(f_cat)):
        if not isinstance(X_train_cat[i][j], six.string_types):
            X_train_cat[i][j] = '(MISSING)'
            
for i in range(X_test_cat.shape[0]):
    for j in range(len(f_cat)):
        if not isinstance(X_test_cat[i][j], six.string_types):
            X_test_cat[i][j] = '(MISSING)'

In [33]:
encoder = MultiColumnLabelEncoder()
X_cat = encoder.fit_transform(pd.DataFrame(np.concatenate([X_train_cat,X_test_cat], axis=0),columns=f_cat)).values

In [34]:
X_train_cat = X_cat[0:n_samples,:]
X_test_cat = X_cat[n_samples:,:]

In [35]:
X_train = np.concatenate((X_train_con,X_train_cat), axis=1)
X_test = np.concatenate((X_test_con,X_test_cat), axis=1)

pd.DataFrame(data=X_train, columns=f_con + f_cat).to_csv(path_or_buf='data/train_6.csv', sep=';')
pd.DataFrame(data=X_test, columns=f_con + f_cat).to_csv(path_or_buf='data/test_6.csv', sep=';')