In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk import word_tokenize

from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib

from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Load the manually labelled accounts.
# The file is opened in a `with` block so the handle is closed as soon as
# pandas has finished parsing (the original open() call was never closed).
# 'rU' keeps Python 2's universal-newline translation; quoting=1 is
# csv.QUOTE_ALL.
# NOTE(review): 'rU' was removed in Python 3.11 -- on Python 3 plain 'r'
# already translates newlines, so switch the mode when porting.
with open('data/manually_categorized/actor_classification_train.csv', 'rU') as csv_file:
    train = pd.read_csv(csv_file, engine='python', sep=",", quoting=1)

In [4]:
# Show the first rows of the freshly loaded table.
train.head()

Unnamed: 0,name,screen_name,lang,favourites_count,statuses_count,friends_count,summary,followers_count,link,listed_count,verified,segment,manual_segment
0,Guy,ZZ0,en,394,14626,122072,"Martial arts, contortion, 7-string elec violin...",122030,http://www.twitter.com/ZZ0,745.0,False,person,0
1,party here,zxynisgod,es,75357,169818,44087,���I hate One Direction.�� -people who have lo...,72756,http://www.twitter.com/zxynisgod,327.0,False,person,1
2,?��,Zxntio,en,24372,38662,118,@rantzantio,119602,http://www.twitter.com/Zxntio,5.0,False,business,1
3,�_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_,zxkia,en-gb,9874,119158,127928,Don't take me seriously. || Turn off rts & tur...,197890,http://www.twitter.com/zxkia,170.0,False,person,0
4,,Zxkia,en,94,5514,12563,Somewhere between I want it and I got it. ~ Pr...,24316,http://twitter.com/Zxkia,,,,0


In [5]:
# Show the last rows of the freshly loaded table.
train.tail()

Unnamed: 0,name,screen_name,lang,favourites_count,statuses_count,friends_count,summary,followers_count,link,listed_count,verified,segment,manual_segment
20246,DOSE,___Dose___,en,8,20194,12,"A Strong Dose of Amazing People, Places, and T...",421003,http://www.twitter.com/___Dose___,2949,False,person,0
20247,One Direction News,_______1d_4ever,en,0,18578,37552,All the latest One Direction news from around ...,46205,http://www.twitter.com/_______1d_4ever,46,False,person,0
20248,Tyrne Clark,10223335,en,467,13481,60692,Shouldn't a strange and wonderful world be ful...,58620,http://www.twitter.com/10223335,54,False,person,1
20249,1776,1776,en,6586,9746,1293,Global incubator & seed fund helping startups ...,87459,http://www.twitter.com/1776,1125,False,person,0
20250,350 dot org,350,en,1153,25876,19502,Join a global movement that's inspiring the wo...,266424,http://www.twitter.com/350,5870,True,person,0


In [6]:
# Summary statistics of the numeric columns; a `count` below the number of
# rows means that column has missing values.
train.describe()

Unnamed: 0,favourites_count,statuses_count,friends_count,followers_count,listed_count,manual_segment
count,19849.0,20208.0,20203.0,20216.0,20060.0,20251.0
mean,6691.346718,43963.012371,25274.769242,253450.126336,1416.941575,0.533159
std,21815.667141,86924.189569,54316.6752,1394768.599471,6656.891966,0.498912
min,0.0,1.0,-1356.0,3.0,0.0,0.0
25%,88.0,5959.75,458.0,54378.5,107.0,0.0
50%,851.0,17867.0,2435.0,81594.5,359.0,1.0
75%,4434.0,46979.0,33536.0,156320.25,1198.0,1.0
max,685477.0,2372600.0,1004606.0,77803396.0,626947.0,1.0


# Feature Engineering

## Drop irrelevant columns

In [7]:
# Remove the `segment` and `link` columns in a single call.
train = train.drop(["segment", "link"], axis=1)

## Boolean fields

In [8]:
# List the distinct values found in the `verified` column.
list(set(train.verified))

[False, nan, True]

In [9]:
# Count the rows whose `verified` flag is missing.
int(train.verified.isnull().sum())

425

In [10]:
# Encode `verified` as an integer 0/1 column.
# A missing flag compares unequal to True and therefore becomes 0, the same
# "not verified" value the original code assigned to nulls.
# This replaces three `.ix` assignments: `.ix` was deprecated in pandas 0.20
# and removed in 1.0, and the masked writes left an object-dtype column.
# The expression is idempotent: re-running it on 0/1 values returns 0/1.
train['verified'] = train['verified'].eq(True).astype(int)

In [11]:
# Re-check the distinct values of `verified` after the recoding.
list(set(train.verified))

[0, 1]

## Text columns

### OneHotEncoding for lang

In [12]:
# Simple manual one-hot encoding of `lang`.
# The `if` guard makes the cell a no-op when `lang` was already encoded.
if "lang" in train:
    # The UI placeholder 'Select Language...' is treated like a missing value.
    # `.loc` replaces `.ix`, which pandas removed in 1.0.
    train.loc[train.lang == 'Select Language...', 'lang'] = None

    # dropna() skips both None and NaN (the old `lang != None` test let NaN
    # through, which would break the string concatenation below); sorted()
    # gives a stable column order instead of arbitrary set order.
    for lang in sorted(train.lang.dropna().unique()):
        # 1.0 where the row has this language, 0.0 everywhere else
        # (including rows with no language).
        train["lang_" + lang] = (train.lang == lang).astype(float)

    train = train.drop(["lang"], axis=1)

### Treat special characters

In [13]:
# Free-text columns that get character-level cleaning in this cell.
text_fields = ["name", "screen_name","summary"]

def treat_special_char(c):
    """Normalise a single character for the n-gram vectorizers.

    Digits are collapsed to '0', any character that is not plain ASCII is
    collapsed to '9', and every other character is returned unchanged.

    Works on Python 2 byte strings (the original behaviour) as well as on
    unicode / Python 3 ``str`` characters, which have no ``decode`` method
    and previously raised instead of being mapped to '9'.

    Parameters
    ----------
    c : str or bytes
        A single character.

    Returns
    -------
    str or bytes
        '0', '9', or ``c`` itself.
    """
    if c.isdigit():
        return '0'
    try:
        # Only the ASCII check matters here; the converted value is discarded.
        if isinstance(c, bytes):
            c.decode('ascii')
        else:
            c.encode('ascii')
    except (UnicodeDecodeError, UnicodeEncodeError):
        return '9'
    return c

for field in text_fields:
    # Missing text becomes the literal string "null" so every value can be
    # iterated character by character. `.loc` replaces the removed `.ix`.
    train.loc[train[field].isnull(), field] = "null"
    # A list comprehension (not `map`) is used so a concrete list is assigned
    # on Python 3 as well, where `map` returns a lazy iterator.
    train[field] = [''.join(treat_special_char(c) for c in value)
                    for value in train[field].values]

train[text_fields].head()

Unnamed: 0,name,screen_name,summary
0,Guy,ZZ0,"Martial arts, contortion, 0-string elec violin..."
1,party here,zxynisgod,999I hate One Direction.999 -people who have l...
2,?99,Zxntio,@rantzantio
3,99_99_99_99_99_99_99_99_99_99_99_99_99_99_99_9...,zxkia,Don't take me seriously. || Turn off rts & tur...
4,,Zxkia,Somewhere between I want it and I got it. ~ Pr...


### CountVectorizer for 'screen_name' and 'name'

In [14]:
def num_char_tokenizer(text):
    """Split `text` into a list holding each of its characters in order."""
    return [ch for ch in text]

for field in ["screen_name","name"]:
    # The guard makes the cell a no-op for a field that was already replaced
    # by its n-gram columns.
    if field in train:

        # Character 3- to 5-grams that occur in at least 8 rows.
        # No `tokenizer` is passed: scikit-learn only uses it when
        # analyzer="word", so it had no effect here.
        field_countvect = CountVectorizer(ngram_range=(3, 5),
                                          analyzer="char",
                                          min_df=8)

        field_matrix = field_countvect.fit_transform(train[field])
        # Build a real list (Python 3's `map` is lazy) of prefixed column names.
        features_names = ["_".join([field, f])
                          for f in field_countvect.get_feature_names()]
        # toarray() replaces the deprecated `.A` shorthand. Reusing
        # train.index keeps the rows aligned in the inner join below even if
        # `train` no longer has the default 0..n-1 index.
        field_df = pd.DataFrame(field_matrix.toarray(),
                                columns=features_names,
                                index=train.index)
        print(field_matrix.shape)

        train = pd.concat([train, field_df], axis=1, join='inner').drop([field], axis=1)
        print(train.shape)

(20251, 9631)
(20251, 9672)
(20251, 12459)
(20251, 22130)


### CountVectorizer for 'summary'

In [15]:
def num_word_tokenizer(text):
    """Return the runs of word characters (regex ``\\w+``) found in `text`."""
    return nltk.RegexpTokenizer(r'\w+').tokenize(text)

# The guard makes the cell a no-op once `summary` has been replaced by its
# n-gram columns.
if "summary" in train:

    # Word 2- to 4-grams that occur in at least 5 rows.
    summary_countvect = CountVectorizer(tokenizer=num_word_tokenizer,
                                        ngram_range=(2, 4),
                                        analyzer="word",
                                        min_df=5)

    summary_matrix = summary_countvect.fit_transform(train.summary)
    # Build a real list (Python 3's `map` is lazy) of prefixed column names.
    features_names = ["_".join(["summary", f])
                      for f in summary_countvect.get_feature_names()]
    # toarray() replaces the deprecated `.A` shorthand. Reusing train.index
    # keeps the rows aligned in the inner join below even if `train` no
    # longer has the default 0..n-1 index.
    summary_df = pd.DataFrame(summary_matrix.toarray(),
                              columns=features_names,
                              index=train.index)
    print(summary_matrix.shape)
    train = pd.concat([train, summary_df], axis=1, join='inner').drop(["summary"], axis=1)
    print(train.shape)

(20251, 8560)
(20251, 30689)


In [16]:
# Replace every remaining missing value with 0.
train = train.fillna(0)

# Training the model

## Logistic Regression

In [17]:
# Name of the label column.
outcome = "manual_segment"

# Every other column is a model input. The list is built from train.columns
# (not from a set) so the feature order is the same on every run, which keeps
# fitted models and their column positions reproducible.
features = [col for col in train.columns if col != outcome]

In [None]:
# 10-fold cross-validation of a logistic regression.
# indices=False makes KFold yield boolean row masks; random_state fixes the
# shuffle so the folds (and therefore the scores) are reproducible.
k_fold = KFold(n=len(train), n_folds=10, indices=False, shuffle=True, random_state=0)

# Convert to NumPy once instead of copying and converting the frame per fold.
X = np.asarray(train[features])
y = np.asarray(train[outcome])

for tr_mask, cv_mask in k_fold:
    model = LogisticRegression(C=1e5)
    model.fit(X[tr_mask], y[tr_mask])

    cv_y = y[cv_mask]
    pred_y = model.predict(X[cv_mask])

    print(confusion_matrix(cv_y, pred_y))
    # Accuracy from the predictions already made (what model.score computes),
    # so the validation fold is not predicted a second time.
    print('score:' + str(float(np.mean(pred_y == cv_y))))

# To fit the final model on every row:
# model = LogisticRegression(C=1e5).fit(X, y)

[[308 635]
 [202 881]]
score:0.586870681145
[[348 587]
 [260 830]]
score:0.581728395062
[[347 582]
 [254 842]]
score:0.587160493827
[[334 612]
 [236 843]]
score:0.581234567901
[[345 647]
 [179 854]]
score:0.592098765432
[[306 644]
 [250 825]]
score:0.558518518519
[[301 627]
 [219 878]]
score:0.582222222222
[[326 601]
 [242 856]]
score:0.583703703704
[[299 624]
 [217 885]]
score:0.584691358025
[[322 659]
 [223 821]]
score:0.564444444444


  stacklevel=1)


## Random Forest

In [None]:
# 10-fold cross-validation of a 25-tree random forest.
# indices=False makes KFold yield boolean row masks; random_state fixes the
# shuffle (and, on the model, the tree construction) so runs are reproducible.
k_fold = KFold(n=len(train), n_folds=10, indices=False, shuffle=True, random_state=0)

# Convert to NumPy once instead of copying and converting the frame per fold.
X = np.asarray(train[features])
y = np.asarray(train[outcome])

for tr_mask, cv_mask in k_fold:
    model = RandomForestClassifier(n_estimators=25, random_state=0)
    model.fit(X[tr_mask], y[tr_mask])

    cv_y = y[cv_mask]
    pred_y = model.predict(X[cv_mask])

    print(confusion_matrix(cv_y, pred_y))
    # Accuracy from the predictions already made (what model.score computes),
    # so the validation fold is not predicted a second time.
    print('score:' + str(float(np.mean(pred_y == cv_y))))

# To fit the final model on every row:
# model = RandomForestClassifier(n_estimators=25, random_state=0).fit(X, y)

[[667 282]
 [219 858]]
score:0.752714708786
[[704 272]
 [206 843]]
score:0.763950617284
[[697 258]
 [205 865]]
score:0.771358024691
[[654 264]
 [188 919]]
score:0.776790123457
[[659 281]
 [178 907]]
score:0.773333333333
[[679 278]
 [182 886]]
score:0.772839506173
[[701 271]
 [211 842]]
score:0.761975308642
[[687 271]
 [203 864]]
score:0.765925925926

In [18]:
# 10-fold cross-validation of a 40-tree random forest.
# indices=False makes KFold yield boolean row masks; random_state fixes the
# shuffle (and, on the model, the tree construction) so runs are reproducible.
k_fold = KFold(n=len(train), n_folds=10, indices=False, shuffle=True, random_state=0)

# Convert to NumPy once instead of copying and converting the frame per fold.
X = np.asarray(train[features])
y = np.asarray(train[outcome])

for tr_mask, cv_mask in k_fold:
    model = RandomForestClassifier(n_estimators=40, random_state=0)
    model.fit(X[tr_mask], y[tr_mask])

    cv_y = y[cv_mask]
    pred_y = model.predict(X[cv_mask])

    print(confusion_matrix(cv_y, pred_y))
    # Accuracy from the predictions already made (what model.score computes),
    # so the validation fold is not predicted a second time.
    print('score:' + str(float(np.mean(pred_y == cv_y))))

# To fit the final model on every row (the old commented-out snippet still
# said n_estimators=25, which did not match this cell):
# model = RandomForestClassifier(n_estimators=40, random_state=0).fit(X, y)

[[651 262]
 [208 905]]
score:0.768015794669
[[693 280]
 [208 844]]
score:0.759012345679
[[667 259]
 [207 892]]
score:0.76987654321
[[706 259]
 [178 882]]
score:0.784197530864
[[676 255]
 [207 887]]
score:0.771851851852
[[640 290]
 [223 872]]
score:0.746666666667
[[668 257]
 [212 888]]
score:0.768395061728
[[709 271]
 [189 856]]
score:0.772839506173
[[677 281]
 [200 867]]
score:0.762469135802
[[689 264]
 [192 880]]
score:0.774814814815


  stacklevel=1)
