In [431]:
import os
import fnmatch
import glob
import json

import numpy as np
import pandas as pd
import chardet
import gc
import matplotlib.pyplot as plt

from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Loading data

In [409]:
input_dir = 'data/manually_categorized/'
input_prefix = 'actor_classification_train'
train = None
for file in os.listdir(input_dir):
  if fnmatch.fnmatch(file, input_prefix+'*.csv'):
    if train is None:
      print "==> Initializing input dataframe: "
      train = pd.read_csv(open(input_dir+file,'rU'),
                          engine='python', sep=",", quoting=1)
    else:
      print "==> Concatenating dataframe from " + file + ": "
      train = pd.concat([train, pd.read_csv(open(input_dir+file,'rU'),
                          engine='python', sep=",", quoting=1)])
    train.drop_duplicates(inplace=True)
    print train.shape

==> Initializing input dataframe: 
(20245, 13)
==> Concatenating dataframe from actor_classification_train_copy.csv: 
(20245, 13)


In [410]:
train.head()

Unnamed: 0,name,screen_name,lang,favourites_count,statuses_count,friends_count,summary,followers_count,link,listed_count,verified,segment,manual_segment
0,Guy,ZZ0,en,394,14626,122072,"Martial arts, contortion, 7-string elec violin...",122030,http://www.twitter.com/ZZ0,745.0,False,person,0
1,party here,zxynisgod,es,75357,169818,44087,���I hate One Direction.�� -people who have lo...,72756,http://www.twitter.com/zxynisgod,327.0,False,person,1
2,?��,Zxntio,en,24372,38662,118,@rantzantio,119602,http://www.twitter.com/Zxntio,5.0,False,business,1
3,�_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_,zxkia,en-gb,9874,119158,127928,Don't take me seriously. || Turn off rts & tur...,197890,http://www.twitter.com/zxkia,170.0,False,person,0
4,,Zxkia,en,94,5514,12563,Somewhere between I want it and I got it. ~ Pr...,24316,http://twitter.com/Zxkia,,,,0


In [411]:
train.tail()

Unnamed: 0,name,screen_name,lang,favourites_count,statuses_count,friends_count,summary,followers_count,link,listed_count,verified,segment,manual_segment
20246,DOSE,___Dose___,en,8,20194,12,"A Strong Dose of Amazing People, Places, and T...",421003,http://www.twitter.com/___Dose___,2949,False,person,0
20247,One Direction News,_______1d_4ever,en,0,18578,37552,All the latest One Direction news from around ...,46205,http://www.twitter.com/_______1d_4ever,46,False,person,0
20248,Tyrne Clark,10223335,en,467,13481,60692,Shouldn't a strange and wonderful world be ful...,58620,http://www.twitter.com/10223335,54,False,person,1
20249,1776,1776,en,6586,9746,1293,Global incubator & seed fund helping startups ...,87459,http://www.twitter.com/1776,1125,False,person,0
20250,350 dot org,350,en,1153,25876,19502,Join a global movement that's inspiring the wo...,266424,http://www.twitter.com/350,5870,True,person,0


In [412]:
train.describe()

Unnamed: 0,favourites_count,statuses_count,friends_count,followers_count,listed_count,manual_segment
count,19844.0,20202.0,20197.0,20210.0,20054.0,20245.0
mean,6692.418766,43975.199733,25278.933951,253485.692528,1416.983445,0.533169
std,21818.175658,86934.198698,54323.16627,1394969.623505,6657.741655,0.498911
min,0.0,1.0,-1356.0,3.0,0.0,0.0
25%,88.0,5968.25,458.0,54386.75,107.0,0.0
50%,851.0,17870.5,2436.0,81605.0,359.0,1.0
75%,4435.0,46988.0,33541.0,156324.75,1198.0,1.0
max,685477.0,2372600.0,1004606.0,77803396.0,626947.0,1.0


# Exploratory Data Analysis

In [413]:
train["summary"] = train["summary"].fillna("")

In [414]:
relevance_margin = 25
relevance_count = 10

summary_words = ["organization", "institute", "institution", 
                 "my ", "myself", 
                 "financial", "money", "social",
                 "family", "husband", "wife", "father", "mother", "kids", "children", 
                 "entrepreneur ", "scientist", "CEO", "CTO ", "CPO",
                 "chief", "leader", "industry", "engineer ", "musician ", "piano ", "guitar"]

for word in summary_words:
    counts = train[train['summary'].str.contains(word)]["manual_segment"].value_counts()
    n_is_person = counts[1]
    p_is_person = round(float(n_is_person)/sum(counts)*100,1)
    
    if (p_is_person > (100 - relevance_margin) or p_is_person < relevance_margin) and n_is_person >= relevance_count:
        print "When summary contains {} {} ({}%) are person".format(word, n_is_person, p_is_person)


When summary contains my  958 (80.4%) are person
When summary contains myself 29 (93.5%) are person
When summary contains husband 61 (92.4%) are person
When summary contains wife 45 (95.7%) are person
When summary contains father 79 (90.8%) are person
When summary contains mother 29 (93.5%) are person
When summary contains entrepreneur  33 (86.8%) are person
When summary contains scientist 15 (78.9%) are person
When summary contains CEO 317 (86.8%) are person
When summary contains chief 13 (76.5%) are person
When summary contains guitar 20 (87.0%) are person


# Feature Engineering

## Non-relevant columns

In [415]:
train = train.drop(["segment"], axis=1)
train = train.drop(["link"], axis=1)

## Boolean fields

In [416]:
list(set(train.verified))

[False, nan, True]

In [417]:
len(train[train.verified.isnull()])

424

In [418]:
train.ix[train.verified.isnull(), 'verified'] = False
train.ix[train.verified == True,  'verified'] = 1
train.ix[train.verified == False, 'verified'] = 0

In [419]:
list(set(train.verified))

[0, 1]

## Text columns

### OneHotEncoding for lang

In [420]:
# Simple manual OHE
if "lang" in train:
    train.ix[(train.lang == 'Select Language...') | (train.lang.isnull()), 'lang'] = None
    for lang in list(set(train.lang)):
        if lang != None:
            train.ix[train.lang == lang, "lang_"+lang] = 1
            train.ix[train.lang != lang, "lang_"+lang] = 0
    train.drop(["lang"], axis=1, inplace=True)

### Treat special characters

In [421]:
text_fields = ["name", "screen_name","summary"]

def treat_special_char(c):
    try:
        encoding = chardet.detect(str(c))['encoding'] or "KOI8-R"
        return '0' if c.isdigit() else c.decode(encoding)
    except UnicodeDecodeError:        
        return '9'

for field in text_fields:
    train.ix[train[field].isnull(), field] = "null"
    train[field] = map(lambda n: ''.join(map(lambda c: treat_special_char(c), list(n))), train[field].values)
    
train[text_fields].head()
gc.collect()

1298

### CountVectorizer for 'screen_name' and 'name'

In [422]:
def num_char_tokenizer(text):
    return list(text)

for field in ["screen_name","name"]:
    if field in train:

        field_tfidf = TfidfVectorizer(tokenizer=num_char_tokenizer,
                                      ngram_range=(3, 5), 
                                      analyzer="char",
                                      binary=True, #False
                                      min_df = 50) #8

        field_matrix = field_tfidf.fit_transform(train[field])
        features_names = map(lambda f: "_".join([field,f]), field_tfidf.get_feature_names())
        field_df = pd.DataFrame(field_matrix.A, columns=features_names)
        print(field_matrix.shape)
        gc.collect()

        train = pd.concat([train, field_df], axis=1, join='inner')
        gc.collect()
        train.drop([field], axis=1, inplace=True)
        gc.collect()
        print(train.shape)

(20245, 1107)
(20239, 1148)
(20239, 1581)
(20233, 2728)


### CountVectorizer for 'summary'

In [423]:
if "summary" in train:

    summary_tfidf = TfidfVectorizer(token_pattern=r'\w+',
                                    ngram_range=(1, 4), 
                                    analyzer="word",
                                    binary=True, #False
                                    sublinear_tf=True, 
                                    stop_words='english',
                                    min_df = 50) #5

    summary_matrix = summary_tfidf.fit_transform(train.summary)
    features_names = map(lambda f: "_".join(["summary",f]), summary_tfidf.get_feature_names())
    summary_df = pd.DataFrame(summary_matrix.A, columns=features_names)
    print(summary_matrix.shape)
    train = pd.concat([train, summary_df], axis=1, join='inner').drop(["summary"], axis=1)
    print(train.shape)

(20233, 748)
(20227, 3475)


In [424]:
train.fillna(0, inplace=True)

### Persist Enriched Data

In [425]:
train.to_csv('data/manually_categorized/enriched-actor_classification_train.csv', index=False, encoding="utf-8")
joblib.dump(train.columns, 'data/manually_categorized/actor_classification_random_forest_features_20151124.csv', compress=9)

['data/manually_categorized/actor_classification_random_forest_features_20151124.csv']

# Feature selection

In [426]:
outcome = "manual_segment"

features = list(set(train.columns) - set([outcome]))

In [427]:
# # Build estimator from PCA and Univariate selection:
# combined_features = FeatureUnion([("pca", PCA(n_components=2)), 
#                                   ("univ_select", SelectKBest(k=1))])

# tr   = np.asarray(train[features])
# tr_y = np.asarray(train[outcome])

# # Use combined features to transform dataset
# X_features = combined_features.fit(tr, tr_y).transform(tr)

# # Random Forest model
# rfmodel = RandomForestClassifier(n_estimators=25)

# # Do grid search over k, n_components and C:
# pipeline = Pipeline([("features", combined_features), ("rf", rfmodel)])

# param_dist = dict(features__pca__n_components=range(50, 500),
#                   features__univ_select__k=range(50, 500))

# random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, verbose=10,
#                                    n_iter=200)
# random_search.fit(tr, tr_y)
# print(random_search.best_estimator_)

# Training the model

## Random Forest

In [428]:
# KFold cross validation setup
k_fold = KFold(n=len(train), n_folds=4, indices=False, shuffle=True)
b_scores, svc_scores = [], []

for tr_indices, cv_indices in k_fold:
    tr   = np.asarray(train[tr_indices][features])
    tr_y = np.asarray(train[tr_indices][outcome])
    
    cv   = np.asarray(train[cv_indices][features])
    cv_y = np.asarray(train[cv_indices][outcome])
    
    # StandardScaler
    scaler = StandardScaler().fit(tr)    
    tr = scaler.transform(tr)

    # Random Forest model
    rfmodel = RandomForestClassifier(n_estimators=100)
    rfmodel.fit(tr, tr_y)

    # Validate
    cv = scaler.transform(cv)
    print(confusion_matrix(cv_y, rfmodel.predict(cv)))    
    print('score:' + str(rfmodel.score(cv, cv_y)))
    
rfmodel = RandomForestClassifier(n_estimators=100)
rfmodel.fit(train[features], train[outcome])

[[1354  960]
 [ 841 1902]]
score:0.643859996045
[[1365 1043]
 [ 762 1887]]
score:0.643069013249
[[1390  999]
 [ 764 1904]]
score:0.651374332608
[[1342  989]
 [ 798 1927]]
score:0.646558544304


  stacklevel=1)


## Persist model and columns

In [None]:
model_path = "data/manually_categorized/actor_classification_random_forest_20151129.pkl"
model_features_path = "data/manually_categorized/actor_classification_random_forest_features_20151129.pkl"

joblib.dump(rfmodel, model_path, compress=9)
joblib.dump(train.columns, model_features_path, compress=9)

# Test

In [None]:
test_model = joblib.load(glob.glob(model_path))
model_features = joblib.load(glob.glob(model_features_path))

## Loading data

In [None]:
test_data = ['{"name":"Светлана Петухова","screen_name":"svpetuhova26623","summary":"","lang":"ru","favourites_count":23,"statuses_count":14,"friends_count":2,"followers_count":1,"listed_count":1,"verified":0}',
             '{"lang":"en","summary":"Artist, Writer, Designer. Tweets on tech, culture, art, animals, love the socioeconomy.","verified":0,"followers_count":175,"friends_count":397,"favourites_count":228,"statuses_count":410,"listed_count":12,"name":"Daniel Adornes","screen_name":"daniel_adornes"}',
             '{"lang":"en","summary":"Artist, Writer, Designer. Tweets on tech, culture, art, animals, love the socioeconomy.","verified":0,"followers_count":175,"friends_count":397,"favourites_count":228,"statuses_count":410,"listed_count":12,"name":"Daniel Adornes","screen_name":"daniel_adornes"}']

In [None]:
test_data = [json.loads(t) for t in test_data]

In [None]:
test_data = pd.DataFrame(test_data)

## Feature Engineering

In [None]:
test_data.ix[test_data.verified.isnull(), 'verified'] = False
test_data.ix[test_data.verified == True,  'verified'] = 1
test_data.ix[test_data.verified == False, 'verified'] = 0

# 'lang'
for lang_field in filter(lambda f: f.startswith("lang_"), self.model_features):
  test_data[lang_field] = (1 if lang_field == "lang_"+(test_data["lang"]).values[0] else 0)
del test_data["lang"]

# Treat special characters
text_fields = ["name", "screen_name","summary"]

def treat_special_char(c):
  try:
    encoding = chardet.detect(str(c))['encoding'] or "KOI8-R"
    return '0' if c.isdigit() else c.decode(encoding)
  except:
    return '9'

for field in text_fields:
  test_data.ix[test_data[field].isnull(), field] = "null"
  test_data[field] = map(lambda n: ''.join(map(lambda c: treat_special_char(c), list(n))), test_data[field].values)

# TfidfVectorizer for 'screen_name' and 'name'
def num_char_tokenizer(text):
  return list(text)

for field in ["screen_name","name"]:
  if field in test_data:
    vocabulary = [f.replace(field+"_", "") for f in self.model_features if f.startswith(field+"_")]
    field_tfidf = TfidfVectorizer(tokenizer=num_char_tokenizer,
                                  ngram_range=(3, 5), 
                                  analyzer="char",
                                  vocabulary = vocabulary)

    field_matrix = field_tfidf.fit_transform(test_data[field])
    features_names = map(lambda f: "_".join([field,f]), field_tfidf.get_feature_names())
    field_df = pd.test_DataFrame(field_matrix.A, columns=features_names)
    gc.collect()
    test_data = pd.concat([test_data, field_df], axis=1, join='inner')
    del test_data[field]
    gc.collect()

# TfidfVectorizer for 'summary'
if "summary" in test_data:
  vocabulary = [f.replace("summary_", "") for f in self.model_features if f.startswith("summary_")]
  summary_tfidf = TfidfVectorizer(token_pattern=r'\w+',
                                  ngram_range=(1, 4), 
                                  analyzer="word",
                                  binary=True, #False 
                                  stop_words='english',
                                  vocabulary = vocabulary)

  summary_matrix = summary_tfidf.fit_transform(test_data.summary)
  features_names = map(lambda f: "_".join(["summary",f]), summary_tfidf.get_feature_names())
  summary_df = pd.test_DataFrame(summary_matrix.A, columns=features_names)
  gc.collect()
  test_data = pd.concat([test_data, summary_df], axis=1, join='inner')
  del test_data["summary"]
  gc.collect()

# Treat remaining null values
test_data.fillna(0, inplace=True)
gc.collect()

## Predict

In [None]:
test_model.predict_proba(test_data)