In [329]:
import random
import os.path
import pandas as pd
import numpy as np

from pyspark.sql import SQLContext, Row
from pyspark.sql.types import *

import matplotlib
matplotlib.style.use('ggplot')

In [None]:
def exists(filepath):
    """Report whether `filepath` exists, as judged by os.path.exists."""
    path_is_present = os.path.exists(filepath)
    return path_is_present

# Split the 48 million rows into 10 million (training) and 38 million (testing)

In [None]:
if (exists('train10M.txt') and exists('test38M.txt')):
    print "train10M.txt and test38M.txt already created before."
else:
    !split -l 2000000 train.txt ff
    !cat ffaa ffat ffaf ffaq ffaj > train10M.txt
    !rm ffaa ffat ffaf ffaq ffaj
    !cat ff* > train38M.txt
    !rm ff*
    print "Newly created files: train10M.txt and test38M.txt."

# Split training data into 3 partitions

In [None]:
if (exists('train5M.txt') and 
    exists('validation2M.txt') and 
    exists('test3M.txt')):
    print "train5M.txt, validation2M.txt, test3M.txt already created before."
else:
    !split -l 1000000 train10M.txt ff
    !cat ffaa ffaj ffad ffaf ffah > train5M.txt    
    !cat ffai ffae > validation2M.txt
    !cat ffac ffag ffab > test3M.txt
    !rm ff*
    print "Newly created files: train5M.txt, validation2M.txt, test3M.txt."

# Perform training and analysis with train5M data

In [404]:
# Load the training partition with pandas' default header handling.
# NOTE(review): the shape printed below is one row short of 5,000,000, which
# suggests the first data row is being consumed as the header - confirm
# whether header=None is wanted here (the label cell below selects by the
# resulting header text).
train_path = 'train5M.txt'
df = pd.read_table(train_path)
print(df.shape)

(4999999, 40)


# Remove and save the first column as y 

In [405]:
# Move the label (first) column out of the feature frame.
# It is selected by position rather than by the literal header text '0',
# which only exists when the first line of the file happens to carry that
# label. `axis` is passed by keyword because newer pandas no longer accepts
# it positionally in DataFrame.drop.
label_col = df.columns[0]
y = df[label_col]
df = df.drop(label_col, axis=1)

# Rename the columns of the train5M data

In [407]:
# Replace the inferred headers with positional names f1 .. fN.
n_features = len(df.columns)
new_col_names = ['f%d' % idx for idx in range(1, n_features + 1)]
df.columns = new_col_names

# Get summary stats for all integer columns

In [408]:
# Summarise the first 13 columns, which this notebook treats as the integer features.
all_columns = df.columns.tolist()
integer_cols = all_columns[:13]
df[integer_cols].describe()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13
count,2756813.0,4999999.0,3906401.0,3853777.0,4863896.0,3887221.0,4788996.0,4997037.0,4788996.0,2756813.0,4788996.0,1177668.0,3853777.0
mean,3.550075,98.595417,30.839373,7.429061,18112.661813,118.854952,16.559919,12.785998,110.211554,0.610537,2.751485,0.946656,8.385096
std,9.865513,385.372407,494.089496,9.02472,68140.20395,421.572801,71.788627,19.683659,229.168662,0.69087,5.243586,5.745028,21.412443
min,0.0,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,2.0,2.0,561.0,8.0,1.0,2.0,11.0,0.0,1.0,0.0,2.0
50%,1.0,2.0,6.0,4.0,2802.0,33.0,3.0,7.0,40.0,1.0,1.0,0.0,4.0
75%,3.0,34.0,17.0,10.0,10082.0,104.0,12.0,19.0,114.0,1.0,3.0,0.0,10.0
max,1900.0,257675.0,65535.0,563.0,23159456.0,367553.0,56311.0,5064.0,29019.0,9.0,165.0,1881.0,7393.0


# Create files for each histogram

In [None]:
def m_count_distinct(col):
    """
        Count how often each distinct value occurs in a single column.

        Every missing value (NaN) is tallied under the string key 'NaN',
        whichever float type carries it: a numpy float from a numeric array
        or a plain Python float from an object array.

        @param col: numpy 1D array (any iterable of hashable values works)
        @return: dictionary of value, freq
    """
    d = {}
    for x in col:
        # NaN never compares equal to NaN, so distinct NaN objects would end
        # up as separate dictionary keys unless they are folded together here.
        if isinstance(x, (float, np.floating)) and np.isnan(x):
            key = 'NaN'
        else:
            key = x
        d[key] = d.get(key, 0) + 1
    return d


# ==== create file from result of each freq count ===
# Write one "value,frequency" text file per column, skipping columns whose
# file is already present.
histo_dir = 'histo-file'
# open(..., 'w') does not create missing directories, so make sure the target
# directory is there before the first write.
if not os.path.isdir(histo_dir):
    os.makedirs(histo_dir)

for col in all_columns:
    filepath = histo_dir + '/' + col + '.txt'
    if exists(filepath):
        print(filepath + " already created.")
    else:
        freq_count = m_count_distinct(df[col].values)
        with open(filepath, 'w') as f:
            for key, value in freq_count.items():
                f.write(str(key) + "," + str(value) + "\n")
        print("Newly created file: " + filepath)

# Plot histograms from stored files

In [None]:
# Plot every stored frequency table with a log-scaled y axis and save it as PNG.
img_dir = 'second-histo-img'
# savefig does not create missing directories.
if not os.path.isdir(img_dir):
    os.makedirs(img_dir)

for col in all_columns:
    filepath = "histo-file/" + col + ".txt"
    histo_df = pd.read_csv(filepath, header=None, names=["val", "freq"])
    # logy has to be a real boolean: the string 'True' is rejected by pandas
    # versions that validate this keyword. The former xlim=(0) was simply the
    # integer 0 (lower bound only); spell it out as a (left, right) pair.
    ax = histo_df.plot(x='val', y='freq', logy=True, title=col, xlim=(0, None))
    fig = ax.get_figure()
    fig.savefig(img_dir + '/' + col + '.png')

# Normalize integer columns after transpose

In [436]:
# Min-max scale each integer column: (value - column min) / (column max - column min).
int_block = df[integer_cols]
col_min, col_max = int_block.min(), int_block.max()
df_norm = (int_block - col_min) / (col_max - col_min)
df_norm.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13
0,0.001053,1.2e-05,0.000671,0.001776,4.404249e-06,2.2e-05,3.6e-05,0.000395,0.000138,0.111111,0.006061,,0.000541
1,0.001053,1.2e-05,1.5e-05,0.024867,3.311822e-05,0.000242,7.1e-05,0.000395,0.008443,0.111111,0.018182,0.001595,0.006087
2,,0.003477,,,0.0001896418,,0.0,0.0,0.0,,0.0,,
3,0.001579,8e-06,,0.0,8.635781e-08,0.0,5.3e-05,0.0,0.0,0.111111,0.006061,,0.0
4,,8e-06,,,0.0005537263,,0.0,0.0,0.000207,,0.0,,


# Select features to be used

In [461]:
# Keep the three columns at positions 13, 14 and 15 of all_columns.
first_kept, stop_kept = 13, 16
cols_to_keep = all_columns[first_kept:stop_kept]
cols_to_keep

['f14', 'f15', 'f16']

# One hot encoding

In [458]:
# Working sample: the first 100,000 rows of the first six columns.
n_sample_rows, n_sample_cols = 100000, 6
xx = df.iloc[:n_sample_rows, :n_sample_cols]
print(xx.shape)
xx.head()

(100000, 6)


Unnamed: 0,f1,f2,f3,f4,f5,f6
0,2.0,0,44.0,1.0,102,8.0
1,2.0,0,1.0,14.0,767,89.0
2,,893,,,4392,
3,3.0,-1,,0.0,2,0.0
4,,-1,,,12824,


In [484]:
# Dummy-encode the columns at positions 13..38 for the first 10,000 rows.
# NOTE(review): this rebinds `xx`, which the df_onehot cell also reads, so the
# result of that cell depends on which of the two ran last.
n_dummy_rows = 10000
xx = df.iloc[:n_dummy_rows, 13:39]
oh = pd.get_dummies(xx)
print(oh.shape)

(10000, 43574)


In [459]:
# NOTE(review): encode_onehot is defined in a later cell of this notebook, so
# that cell has to be executed before this one.
#cols_to_keep = all_columns[:13] + all_columns[17:24]
columns_to_encode = ['f1', 'f2', 'f3', 'f4', 'f5']
df_onehot = encode_onehot(xx, columns_to_encode)
print(df_onehot.shape)
df_onehot.head()



(100000, 6)


Unnamed: 0,f6,f1,f2,f3,f4,f5
0,8.0,2.0,0,44.0,1.0,102
1,89.0,2.0,0,1.0,14.0,767
2,,,893,,,4392
3,0.0,3.0,-1,,0.0,2
4,,,-1,,,12824


In [447]:
from sklearn.feature_extraction import DictVectorizer
 
def encode_onehot(df, cols):
    """
    One-hot encoding is applied to columns specified in a pandas DataFrame.

    The selected columns are converted to one dict per row and passed through
    scikit-learn's DictVectorizer; the resulting dense matrix replaces the
    original columns. DictVectorizer only expands string-valued fields into
    indicator columns - numeric values are carried over as a single column,
    so numeric input columns come back un-expanded.

    Modified from: https://gist.github.com/kljensen/5452382

    Details:

    http://en.wikipedia.org/wiki/One-hot
    http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

    @param df pandas DataFrame
    @param cols a list of columns to encode
    @return a DataFrame with one-hot encoding (remaining columns first,
            encoded columns appended)
    """
    vec = DictVectorizer()

    # Pass the orientation positionally: the old `outtype=` keyword is no
    # longer accepted by pandas, while the positional form works on old and
    # new releases alike.
    records = df[cols].to_dict('records')
    # NOTE: .toarray() materialises a dense matrix, which can be large when
    # the encoded columns have many distinct values.
    vec_data = pd.DataFrame(vec.fit_transform(records).toarray())

    # Newer scikit-learn exposes get_feature_names_out() and has dropped
    # get_feature_names(); use whichever this installation provides.
    if hasattr(vec, 'get_feature_names_out'):
        vec_data.columns = list(vec.get_feature_names_out())
    else:
        vec_data.columns = vec.get_feature_names()
    # Re-use the caller's index so the join below lines rows up correctly.
    vec_data.index = df.index

    df = df.drop(cols, axis=1)
    df = df.join(vec_data)
    return df


# Model and Evaluation

In [None]:
# Features: the encoded sample. Labels: the saved label column, restricted to
# the same rows.
X = df_onehot
# `test_data` is not defined in any visible cell of this notebook, and index
# values are row positions rather than class labels. Reuse `y` (the label
# column split off from df earlier) aligned on X's index. Re-running this
# cell is harmless because y then already matches X.
y = y.loc[X.index]

### Logistic Regression

In [None]:
# Fit a logistic regression on the full feature matrix.
# The module is sklearn.linear_model; it was misspelled as `linear_mode`,
# which made this import fail.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X, y)

In [None]:
# Mean accuracy on the same data the model was fitted on (not a held-out estimate).
training_accuracy = model.score(X, y)
training_accuracy

### Split up the data set into test and training sets

In [None]:
# train_test_split lives in sklearn.model_selection on current scikit-learn;
# the sklearn.cross_validation module has been removed. Fall back to the old
# location for installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
testLogRModel = LogisticRegression()
testLogRModel.fit(x_train, y_train)

In [None]:
# Score on the held-out split only; scoring on all of X, y would also count
# the rows the model was trained on and overstate its accuracy.
testLogRModel.score(x_test, y_test)

### Predict class labels for the test set

In [None]:
# Class labels predicted for the held-out rows.
predicted = testLogRModel.predict(x_test)
print(predicted)

### AUC score from ROC

In [None]:
from sklearn import metrics
# Fraction of held-out rows whose predicted label matches the true label.
test_accuracy = metrics.accuracy_score(y_test, predicted)
print("accuracy: %s" % test_accuracy)
# print "auc:", metrics.roc_auc_score(y_test, predicted)

### Perform cross validation using n folds

In [None]:
# cross_val_score lives in sklearn.model_selection on current scikit-learn;
# the sklearn.cross_validation module has been removed. Fall back to the old
# location for installations that predate model_selection.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
#n = 5
#scores = cross_val_score(LogisticRegression(), X, y, scoring='accuracy')
#print scores
#print scores.mean()

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so repeated runs build the same model; 1 is the same
# random_state value the train/test split cell uses.
clf_random_forest = RandomForestClassifier(n_estimators=10, random_state=1)
clf_random_forest.fit(X, y)

In [None]:
# Mean accuracy on the very data the forest was fitted on (not a held-out estimate).
forest_training_accuracy = clf_random_forest.score(X, y)
forest_training_accuracy