In [1]:
import sys
import csv
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Imputer
from sklearn import svm

In [2]:
def encode_onehot(df, cols):
    """
    One-hot encoding is applied to columns specified in a pandas DataFrame.
    
    Modified from: https://gist.github.com/kljensen/5452382
    
    Details:
    
    http://en.wikipedia.org/wiki/One-hot
    http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
    
    @param df pandas DataFrame
    @param cols a list of columns to encode
    @return a DataFrame with one-hot encoding
    """
    vec = DictVectorizer()
    
    vec_data = pd.DataFrame(vec.fit_transform(df[cols].to_dict(orient='records')).toarray())
    vec_data.columns = vec.get_feature_names()
    vec_data.index = df.index
    
    df = df.drop(cols, axis=1)
    df = df.join(vec_data)
    return df

In [3]:
# Read data
df = pd.read_csv('training.csv')

# Shuffle data (rows)
df = df.iloc[np.random.permutation(len(df))]

df = df.head(30000)

In [4]:
# Explore columns
#len(df.SubModel.unique())
#print df.AUCGUART.isnull().sum()
#print df.PRIMEUNIT.isnull().sum()
print df.groupby('IsBadBuy').count()['RefId']

IsBadBuy
0    26369
1     3631
Name: RefId, dtype: int64


In [5]:
# Drop columns
cols_to_drop = ['RefId', 'PurchDate']#, 'Color', 'WheelTypeID', 'PRIMEUNIT', 'AUCGUART'] 
df = df.drop(cols_to_drop, axis=1)

In [6]:
# Get labels
Y = df['IsBadBuy'].values.tolist()
df = df.drop('IsBadBuy', axis=1)

In [7]:
# Normalize and pre-process data
df1 = df.select_dtypes(include=['int64', 'float64'])
df2 = df.select_dtypes(exclude=['int64', 'float64'])

df1_norm = (df1 - df1.min()) / (df1.max() - df1.min())

df2_onehot = encode_onehot(df2, df2.columns)

frames = [df1_norm, df2_onehot]
X = pd.concat(frames, axis = 1)
X = X.values
X = Imputer().fit_transform(X)
X = X.tolist()

In [8]:
# Training/testing split
split = 20000
X_train = X[:split]
X_test = X[split:]
Y_train = Y[:split]
Y_test = Y[split:]

In [9]:
#clf = DecisionTreeClassifier()
#clf = clf.fit(X_train, Y_train)

#clf = LogisticRegression()
#clf = clf.fit(X_train, Y_train)

clf = svm.SVC()
clf = clf.fit(X_train, Y_train)

In [10]:
Y_predict = clf.predict(X_test)
Y_predict = Y_predict.tolist()

In [12]:
wrong = 0
tp = 0
fp = 0
fn = 0

for idx, pred in enumerate(Y_predict):
    if pred == 1 and Y_test[idx] == 1:
        tp += 1
    if pred == 1 and Y_test[idx] == 0:
        fp += 1
    if pred == 0 and Y_test[idx] == 1:
        fn += 1
    if pred != Y_test[idx]:
        wrong += 1
   
print tp, fp, fn
err = wrong*1.0/len(Y_test)
precision = tp*1.0/(tp+fp)
recall = tp*1.0/(tp+fn)

print tp, fp, fn
print err, precision, recall

0 0 1259


ZeroDivisionError: float division by zero

In [18]:
df = pd.read_csv('dummy.csv')
df = df.corr()[['X1']]
df = df.sort_values(['X1'], 0, False)
df.to_csv('dummy_out.csv')

In [44]:
type(LogisticRegression())

sklearn.linear_model.logistic.LogisticRegression