In [44]:
import pandas as pd
import os

current_dir = os.getcwd()
root_path = os.path.join(current_dir, os.pardir, 'data', 'raw')
train_path = os.path.join(root_path, 'train.csv')
test_path = os.path.join(root_path, 'test.csv')

train = pd.read_csv(train_path)

In [62]:
def one_hot(df, column):
    df = pd.concat([df,pd.get_dummies(df[column], prefix=column)],axis=1)
    df.drop([column],axis=1, inplace=True)
    return df

def preprocess(df):
    df = one_hot(df, 'feature_1')
    df = one_hot(df, 'feature_2')
    df = one_hot(df, 'feature_3')
    df['year'] = pd.to_datetime(df['first_active_month']).dt.year
    df['month'] = pd.to_datetime(df['first_active_month']).dt.month
    df['year'].fillna(2017, inplace =True)
    df['month'].fillna(df['month'].mean(), inplace =True)
    df.drop(['first_active_month'],axis=1, inplace=True)
    df.drop(['card_id'],axis=1, inplace=True)  

    return df

In [46]:
train.sample(frac=1).reset_index(drop=True)

train = preprocess(train)

y = train.pop('target').values
X = train.values

In [47]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import linear_model
import math 

regr = linear_model.LinearRegression()

math.sqrt(-cross_val_score(regr, X, y, scoring='neg_mean_squared_error' , cv=8, n_jobs=8).mean())

3.844234520180109

In [83]:
from sklearn import ensemble

regr = ensemble.RandomForestRegressor(n_estimators=25)

math.sqrt(-cross_val_score(regr, X, y, scoring='neg_mean_squared_error' , cv=20, n_jobs=8).mean())

3.847732982496437

In [48]:
test = pd.read_csv(test_path)

In [63]:
test_labeless = preprocess(test)
X_test = test_labeless.values
regr.fit(X, y)

In [81]:
import csv

predictions = regr.predict(X_test)
output =  os.path.join(current_dir, os.pardir, 'data', 'processed', '.csv')
with open(output, 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['card_id','target'])
    for i, row in test.iterrows(): 
        writer.writerow([row['card_id'], predictions[i]])

