In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn library for preprocessing
from sklearn import preprocessing

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
try:
    # GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18;
    # the old sklearn.grid_search module was removed in 0.20.
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Fallback for scikit-learn releases older than 0.18.
    from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# Fix NumPy's global random state so repeated runs produce identical results.
np.random.seed(seed=0)



In [2]:
# Read the raw CSV exports into DataFrames.
users_train_raw = pd.read_csv('../zip_files/train_users_2.csv.zip')
sessions_raw = pd.read_csv('../zip_files/sessions.csv.zip')
demographics = pd.read_csv('../zip_files/age_gender_bkts.csv.zip')
countries = pd.read_csv('../zip_files/countries.csv.zip')
test = pd.read_csv('../zip_files/test_users.csv.zip')

# Name of the target column and number of rows held out as the dev split.
LABEL_COLUMN = "country_destination"
DEV_SIZE = 25000

# Shuffle the training rows. Re-seeding here keeps the permutation identical
# whenever this cell is re-run on its own.
np.random.seed(0)
shuffle = np.random.permutation(np.arange(users_train_raw.shape[0]))
x = users_train_raw.reindex(shuffle)

# Integer-encode every column. With sort=True the codes follow the sorted
# order of each column's unique values; missing values are encoded as -1.
# NOTE(review): every column of the users table is encoded and used as a
# feature, including any identifier or date columns it may contain -- confirm
# that none of them should be excluded (e.g. columns that reveal the label).
y = pd.DataFrame()
for column in x.columns:
    y[column] = pd.factorize(x[column], sort=True)[0]

# Separate the label column (by name, consistent with label_names below)
# from the features, then scale each feature row to unit L2 norm
# (the default behaviour of preprocessing.normalize).
feature_matrix = np.asarray(y.drop(LABEL_COLUMN, axis=1))
data = preprocessing.normalize(feature_matrix)
labels = np.asarray(y[LABEL_COLUMN])

# The first DEV_SIZE shuffled rows form the dev set; the rest are training data.
dev_data, dev_labels = data[:DEV_SIZE], labels[:DEV_SIZE]
train_data, train_labels = data[DEV_SIZE:], labels[DEV_SIZE:]

# Sorted unique label values; position i corresponds to encoded label i.
label_names = np.unique(x[LABEL_COLUMN])



In [4]:
# Logistic regression: cross-validated grid search over C (inverse
# regularisation strength) for ten candidates, 0.01 through 0.10.
c_candidates = [0.01 * step for step in range(1, 11)]
logistic_params = {'C': c_candidates}
logistic_grid = GridSearchCV(LogisticRegression(), logistic_params).fit(
    train_data, train_labels
)
# Keep the winning parameter dict for the cells below.
logistic_best_param = logistic_grid.best_params_


In [5]:
# Redundant with the assignment at the end of the grid-search cell: this only
# re-reads best_params_ from the already-fitted search object.
logistic_best_param = logistic_grid.best_params_

In [6]:
# Display the hyper-parameters selected by the grid search.
logistic_best_param

{'C': 0.1}

In [7]:
# Refit logistic regression on the training split using the C selected by the
# grid search (read from its result rather than hand-copied, so it cannot go
# stale if the grid or the data change), then evaluate on the dev split.
best_c = logistic_best_param['C']
log_model = LogisticRegression(C=best_c)
log_model.fit(train_data, train_labels)
prediction = log_model.predict(dev_data)
# score() on a classifier returns mean accuracy over the given samples.
accuracy = log_model.score(dev_data, dev_labels)

In [8]:
# Display the dev-set accuracy of the refit logistic regression model.
accuracy

0.59484000000000004