In [97]:
import os
import re
import fnmatch
import json

import numpy as np
import pandas as pd
import chardet
import gc
import matplotlib.pyplot as plt

from pprint import pprint
from time import time
from scipy.stats import randint as sp_randint

import nltk

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_is_fitted
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler
from sklearn.lda import LDA
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# Loading data

In [98]:
input_dir = 'data/manually_categorized/'
input_prefix = 'actor_classification_train'
train = None
for file in os.listdir(input_dir):
  if fnmatch.fnmatch(file, input_prefix+'*.csv'):
    if train is None:
      print "==> Initializing input dataframe: "
      train = pd.read_csv(open(input_dir+file,'rU'),
                          engine='python', sep=",", quoting=1)
    else:
      print "==> Concatenating dataframe from " + file + ": "
      train = pd.concat([train, pd.read_csv(open(input_dir+file,'rU'),
                          engine='python', sep=",", quoting=1)])
    train.drop_duplicates(inplace=True)
    print train.shape

==> Initializing input dataframe: 
(20245, 13)
==> Concatenating dataframe from actor_classification_train_copy.csv: 
(20245, 13)


In [99]:
train.head()

Unnamed: 0,name,screen_name,lang,favourites_count,statuses_count,friends_count,summary,followers_count,link,listed_count,verified,segment,manual_segment
0,Guy,ZZ0,en,394,14626,122072,"Martial arts, contortion, 7-string elec violin...",122030,http://www.twitter.com/ZZ0,745.0,False,person,0
1,party here,zxynisgod,es,75357,169818,44087,���I hate One Direction.�� -people who have lo...,72756,http://www.twitter.com/zxynisgod,327.0,False,person,1
2,?��,Zxntio,en,24372,38662,118,@rantzantio,119602,http://www.twitter.com/Zxntio,5.0,False,business,1
3,�_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_,zxkia,en-gb,9874,119158,127928,Don't take me seriously. || Turn off rts & tur...,197890,http://www.twitter.com/zxkia,170.0,False,person,0
4,,Zxkia,en,94,5514,12563,Somewhere between I want it and I got it. ~ Pr...,24316,http://twitter.com/Zxkia,,,,0


In [100]:
train.tail()

Unnamed: 0,name,screen_name,lang,favourites_count,statuses_count,friends_count,summary,followers_count,link,listed_count,verified,segment,manual_segment
20246,DOSE,___Dose___,en,8,20194,12,"A Strong Dose of Amazing People, Places, and T...",421003,http://www.twitter.com/___Dose___,2949,False,person,0
20247,One Direction News,_______1d_4ever,en,0,18578,37552,All the latest One Direction news from around ...,46205,http://www.twitter.com/_______1d_4ever,46,False,person,0
20248,Tyrne Clark,10223335,en,467,13481,60692,Shouldn't a strange and wonderful world be ful...,58620,http://www.twitter.com/10223335,54,False,person,1
20249,1776,1776,en,6586,9746,1293,Global incubator & seed fund helping startups ...,87459,http://www.twitter.com/1776,1125,False,person,0
20250,350 dot org,350,en,1153,25876,19502,Join a global movement that's inspiring the wo...,266424,http://www.twitter.com/350,5870,True,person,0


# Feature Engineering

## Custom Transformer with LabelEncoder for 'verified'

In [101]:
class VerifiedTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        X.verified.fillna(False, inplace=True)
        X.verified = LabelEncoder().fit_transform(X.verified)
        return X

In [102]:
verified_transformer = VerifiedTransformer()
verified_transformer.transform(train)

print train.verified.value_counts()

False    16913
True      2908
dtype: int64


## Custom OneHotEncoding for lang

In [103]:
class LangOneHotEncoding(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        valid_langs = list(set(X.lang) - set([None, np.nan, 'Select Language...']))
        self.feature_names_ = ["lang_"+str(l) for l in valid_langs if type(l) == str]
        return self

    def transform(self, X, y=None):
        check_is_fitted(self, 'feature_names_')
        
        X = X.copy()
        X["lang"].fillna("", inplace=True)
        for lang_feature in self.feature_names_:
            X[lang_feature] = [(1 if lang_feature == "lang_"+v else 0) for v in X["lang"].values]
        
        X.drop(["lang"], axis=1, inplace=True)
        return X
    
lang_ohe = LangOneHotEncoding().fit(train)

## Replacing NAs on textual fields

In [104]:
class FillTextNA(BaseEstimator, TransformerMixin):

    def __init__(self, cols, replace_by=""):
        self.cols = cols
        self.replace_by = replace_by

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for c in self.cols:
            if c in X:
                X[c].fillna(self.replace_by, inplace=True)
        return X

## DataFrameTfidfVectorizer for textual fields

In [105]:
class DataFrameTfidfVectorizer(TfidfVectorizer):

    def __init__(self, col, prefix=None, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, analyzer='word',
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), max_df=1.0, min_df=1,
                 max_features=None, vocabulary=None, binary=False,
                 dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):
        super(DataFrameTfidfVectorizer, self).__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype)

        self.col = col
        self.prefix = prefix or col
        
    def treat_special_char(self, c):
        try:
            encoding = chardet.detect(str(c))['encoding'] or "KOI8-R"
            return '0' if c.isdigit() else c.decode(encoding)
        except:        
            return '9'

    def treat_special_chars(self, col):
        col.fillna("null", inplace=True)
        col = [''.join([self.treat_special_char(c) for c in list(n)]) 
               for n in col.values]
        return col

    def fit(self, dataframe, y=None):
        dataframe = dataframe.copy()
        dataframe[self.col] = self.treat_special_chars(dataframe[self.col])
        super(DataFrameTfidfVectorizer, self).fit(dataframe[self.col])
        return self

    def fit_transform(self, dataframe, y=None):
        dataframe = dataframe.copy()
        dataframe[self.col] = self.treat_special_chars(dataframe[self.col])
        field_matrix = super(DataFrameTfidfVectorizer, self).fit_transform(dataframe[self.col])
        features_names = map(lambda f: "_".join([self.prefix,f]), super(DataFrameTfidfVectorizer, self).get_feature_names())
        field_df = pd.DataFrame(field_matrix.A, columns=features_names)

        dataframe = dataframe.join(field_df)

        return dataframe

    def transform(self, dataframe, copy=True):
        dataframe = dataframe.copy()
        dataframe[self.col] = self.treat_special_chars(dataframe[self.col])
        field_matrix = super(DataFrameTfidfVectorizer, self).transform(dataframe[self.col])
        features_names = map(lambda f: "_".join([self.prefix,f]), super(DataFrameTfidfVectorizer, self).get_feature_names())
        field_df = pd.DataFrame(field_matrix.A, columns=features_names)

        dataframe = dataframe.join(field_df)

        return dataframe

In [106]:
name_chars_tfidf = DataFrameTfidfVectorizer(col="name", 
                                            prefix="name_c",
                                            ngram_range=(3, 5), 
                                            analyzer="char",
                                            binary=True, #False
                                            min_df = 50) #8

name_words_tfidf = DataFrameTfidfVectorizer(col="name", 
                                            prefix="name_w", 
                                            token_pattern=r'\w+',
                                            ngram_range=(1, 2), 
                                            analyzer="word",
                                            binary=True, #False
                                            min_df = 10) #8

screen_name_tfidf = DataFrameTfidfVectorizer(col="screen_name", 
                                             ngram_range=(3, 5), 
                                             analyzer="char",
                                             binary=True, #False
                                             min_df = 50) #8

summary_tfidf = DataFrameTfidfVectorizer(col="summary",
                                         token_pattern=r'\w+',
                                         ngram_range=(1, 3), 
                                         analyzer="word",
                                         binary=True, #False
                                         sublinear_tf=True, 
                                         stop_words='english',
                                         min_df = 50) #5

## Further textual analysis

In [107]:
class TextToLowerCase(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for c in self.cols:
            if c in X:
                X[c] = [t.lower() for t in X[c].values]
        return X

In [108]:
class NumberOfWords(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for c in self.cols:
            if c in X:
                X["number_of_words_in_"+c] = [len(t.split(' ')) for t in X[c].values]
        return X

In [109]:
class NumberNonAlphaNumChars(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for c in self.cols:
            if c in X:
                X["number_of_non_alphanum_in_"+c] = [len(re.sub(r"[\w\d]","", t)) for t in X[c].values]
        return X

In [110]:
class NumberUpperCaseChars(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for c in self.cols:
            if c in X:
                X["number_of_upper_case_chars_in_"+c] = [len(re.sub(r"[^A-Z]","", t)) for t in X[c].values]
        return X

In [111]:
class NumberCamelCaseWords(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for c in self.cols:
            if c in X:
                X["number_of_camel_case_words_in_"+c] = [len(re.findall(r"^[A-Z][a-z]|\s[A-Z][a-z]", t)) 
                                                         for t in X[c].values]
        return X

In [112]:
class NumberOfMentions(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for c in self.cols:
            if c in X:
                X["number_of_mentions_in_"+c] = [len(re.findall(r"\s@[a-zA-Z]",t)) 
                                                         for t in X[c].values]
        return X

In [113]:
class NumberOfPeriods(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for c in self.cols:
            if c in X:
                X["number_of_periods_in_"+c] = [len(t.split(". ")) 
                                                         for t in X[c].values]
        return X

In [114]:
class AvgWordsPerPeriod(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for c in self.cols:
            if c in X:
                X["avg_words_per_period_in_"+c] = [np.mean([len(p.split(" ")) for p in t.split(". ")]) 
                                                         for t in X[c].values]
        return X

In [115]:
class MentionToFamilyRelation(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self
    
    def count_mentions(self, t):
        count = 0
        tokenizer = nltk.RegexpTokenizer(r'[a-z]+')
        for tkn in tokenizer.tokenize(t):
              if tkn in ["husband","wife","father","mother","daddy","mommy",
                         "grandfather","grandmother","grandpa","grandma"]:
                    count += 1
        return count

    def transform(self, X, y=None):
        X = X.copy()
        for c in self.cols:
            if c in X:
                X["mention_to_family_relation_in_"+c] = [self.count_mentions(t) 
                                                         for t in X[c].values]
        return X

## External Data

### Professional Occupations

Mentions to US occupations according to http://data.okfn.org/data/johnlsheridan/occupations

In [116]:
occupations = pd.read_csv("https://raw.githubusercontent.com/johnlsheridan/occupations/master/occupations.csv")
occupations.Occupations = [o.lower() for o in occupations.Occupations.values]
occupations.head()

Unnamed: 0,Occupations
0,accountant
1,accounts assistant
2,accounts clerk
3,accounts manager
4,accounts staff


In [117]:
class MentionToOccupation(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        occupations = pd.read_csv("https://raw.githubusercontent.com/johnlsheridan/occupations/master/occupations.csv")
        self.occupations_ = [o.lower() for o in occupations.Occupations.values]
        return self
    
    def count_mentions(self, t):
        count = 0
        for o in self.occupations_:
            count += len(re.findall(r"(^|\W)%s(\W|$)" % o, t))
            if count == 3:
                break
        return count

    def transform(self, X, y=None):
        check_is_fitted(self, 'occupations_')
        X = X.copy()
        for c in self.cols:
            if c in X:
                X["mention_to_occupation_in_"+c] = [self.count_mentions(t) 
                                                     for t in X[c].values]
        return X
    
mention_to_occupation = MentionToOccupation(["summary"]).fit(train)

### Person Names

Person names according to http://deron.meranda.us/data/census-dist-female-first.txt and http://deron.meranda.us/data/census-dist-male-first.txt

In [118]:
female_names = pd.read_csv("http://deron.meranda.us/data/census-dist-female-first.txt", names=["name"])
male_names   = pd.read_csv("http://deron.meranda.us/data/census-dist-male-first.txt", names=["name"])

In [119]:
female_names.head()

Unnamed: 0,name
0,MARY 2.629 2.629 1
1,PATRICIA 1.073 3.702 2
2,LINDA 1.035 4.736 3
3,BARBARA 0.980 5.716 4
4,ELIZABETH 0.937 6.653 5


In [120]:
male_names.head()

Unnamed: 0,name
0,JAMES 3.318 3.318 1
1,JOHN 3.271 6.589 2
2,ROBERT 3.143 9.732 3
3,MICHAEL 2.629 12.361 4
4,WILLIAM 2.451 14.812 5


In [121]:
female_names = [re.sub(r"[^a-z]","",n.lower()) for n in female_names.name.values]
male_names   = [re.sub(r"[^a-z]","",n.lower()) for n in male_names.name.values]
person_names = list(set(male_names + female_names))
print len(female_names)
print len(male_names)
print len(person_names)

4275
1219
5163


In [122]:
class PersonNames(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        female_names = pd.read_csv("http://deron.meranda.us/data/census-dist-female-first.txt", names=["name"])
        male_names   = pd.read_csv("http://deron.meranda.us/data/census-dist-male-first.txt", names=["name"])
        female_names = [re.sub(r"[^a-z]","",n.lower()) for n in female_names.name.values]
        male_names   = [re.sub(r"[^a-z]","",n.lower()) for n in male_names.name.values]        
        self.person_names_ = list(set(male_names + female_names))
        return self
    
    def count_mentions(self, t):
        count = 0
        tokenizer = nltk.RegexpTokenizer(r'[a-z]+')
        for name in tokenizer.tokenize(t):
            if name in self.person_names_:
                count += 1
        return count

    def transform(self, X, y=None):
        check_is_fitted(self, 'person_names_')
        X = X.copy()
        for c in self.cols:
            if c in X:
                X["person_names_in_"+c] = [self.count_mentions(t) 
                                            for t in X[c].values]
        return X
    
person_names = PersonNames(["name"]).fit(train)

## Drop Columns Transformer

In [123]:
class DropColumnsTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for c in self.cols:
            if c in X:
                X.drop([c], axis=1, inplace=True)
        return X

## Final Imputer and np array transformer

In [124]:
class NumpyArrayTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        X = X.reindex_axis(sorted(X.columns), axis=1)
        X.fillna(0, inplace=True)
        return np.asarray(X)

## [Debugger for pipeline]

In [125]:
class Debugger(BaseEstimator, TransformerMixin):

    def __init__(self, name=""):
        self.name = name

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        print "-------------------------"
        print self.name
        print "Dataset dimensions:"
        print X.shape
        print "-------------------------"
        return X

# Training the model

In [126]:
outcome = "manual_segment"

features = list(set(train.columns) - set([outcome]))

## Cross Validation

In [127]:
# RandomForestClassifier - 0.69
# LogisticRegression - 0.57
# GaussianNB 0.45
# LinearSVC 0.5
# KNeighborsClassifier 0.58

n_estimators = 100

# Model Pipeline
pipeline = Pipeline([ ("drop_cols", DropColumnsTransformer(["segment","link"])),
                      ("verified", VerifiedTransformer()),
                      ("lang", lang_ohe),
                      ("fill_text_na", FillTextNA(["screen_name","name","summary"], "null")),
                      ("qt_words", NumberOfWords(["name","summary"])),
                      ("qt_non_alphanum_chars", NumberNonAlphaNumChars(["name","summary"])),
                      ("qt_upper_case_chars", NumberUpperCaseChars(["name","summary"])),
                      ("qt_camel_case_words", NumberCamelCaseWords(["name","summary"])),
                      ("qt_mentions", NumberOfMentions(["summary"])),
                      ("qt_periods", NumberOfPeriods(["summary"])),
                      ("avg_words_per_period", AvgWordsPerPeriod(["summary"])),
                      ("lower_case", TextToLowerCase(["screen_name","name","summary"])),
                      ("family", MentionToFamilyRelation(["summary"])),
                      ("person_names", person_names),
                      ("occupations", mention_to_occupation),
                      ("name_chars_tfidf", name_chars_tfidf),
                      ("name_words_tfidf", name_words_tfidf),
                      ("screen_name_tfidf", screen_name_tfidf),
                      ("summary_tfidf", summary_tfidf),
                      ("drop_text_cols", DropColumnsTransformer(["screen_name","name","summary"])),
                      ("debugger", Debugger("Dataset Details")),
                      ("nparray", NumpyArrayTransformer()),
                      ("model", RandomForestClassifier())])

In [128]:
k_fold = KFold(n=len(train), n_folds=4, shuffle=True)
b_scores, svc_scores = [], []

for tr_indices, cv_indices in k_fold:
    tr    = train.iloc[tr_indices,:].loc[:, features].copy()
    cv    = train.iloc[cv_indices,:].loc[:, features].copy()

    tr_y  = train.iloc[tr_indices,:][outcome].values
    cv_y  = train.iloc[cv_indices,:][outcome].values

    pipeline.fit(tr, tr_y)

    print(confusion_matrix(cv_y, pipeline.predict(cv)))    
    print('#### SCORE:' + str(pipeline.score(cv, cv_y)))

[[1784  608]
 [ 765 1905]]
#### SCORE:0.72876333465
[[1792  567]
 [ 860 1842]]
#### SCORE:0.718039913061
[[1773  579]
 [ 737 1972]]
#### SCORE:0.739972337483
[[1786  562]
 [ 820 1893]]
#### SCORE:0.726931436475


## Model Pipeline

In [129]:
pipeline.set_params(model__n_estimators = n_estimators)
pipeline.fit(train.loc[:,features], train.loc[:,outcome])

Pipeline(steps=[('drop_cols', DropColumnsTransformer(cols=['segment', 'link'])), ('verified', VerifiedTransformer()), ('lang', LangOneHotEncoding()), ('fill_text_na', FillTextNA(cols=['screen_name', 'name', 'summary'], replace_by='null')), ('qt_words', NumberOfWords(cols=['name', 'summary'])), ('qt_non_alpha...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

## Persist model pipeline

In [130]:
model_path = "data/manually_categorized/actor_classification_trained_model_20151129.pkl"
joblib.dump(pipeline, model_path, compress=9)

['data/manually_categorized/actor_classification_trained_model_20151129.pkl']

# Test

In [131]:
test_model = joblib.load(model_path)

## Loading data

In [137]:
test_data = ['{"name":"Светлана Петухова","screen_name":"svpetuhova26623","summary":"","lang":"ru","favourites_count":23,"statuses_count":14,"friends_count":2,"followers_count":1,"listed_count":1,"verified":0}',
             '{"lang":"en","summary":"Artist, Writer, Designer. Tweets on tech, culture, art, animals, love the socioeconomy.","verified":0,"followers_count":175,"friends_count":397,"favourites_count":228,"statuses_count":410,"listed_count":12,"name":"Daniel Adornes","screen_name":"daniel_adornes"}',
             '{"lang":"en","summary":"Entrepreneur, enthusiast of Data Science!","verified":1,"followers_count":175,"friends_count":397,"favourites_count":228,"statuses_count":410,"listed_count":12,"name":"Daniel Adornes","screen_name":"daniel_adornes"}',
             '{"lang":"en","summary":"Bring your cute dog here and we will wash and feed him for you","verified":1,"followers_count":175,"friends_count":397,"favourites_count":228,"statuses_count":410,"listed_count":12,"name":"PetShop Molecão","screen_name":"petmolecao"}']
test_data = [json.loads(t) for t in test_data]
test_data = pd.DataFrame(test_data)

In [138]:
test_data

Unnamed: 0,favourites_count,followers_count,friends_count,lang,listed_count,name,screen_name,statuses_count,summary,verified
0,23,1,2,ru,1,Светлана Петухова,svpetuhova26623,14,,0
1,228,175,397,en,12,Daniel Adornes,daniel_adornes,410,"Artist, Writer, Designer. Tweets on tech, cult...",0
2,228,175,397,en,12,Daniel Adornes,daniel_adornes,410,"Entrepreneur, enthusiast of Data Science!",1
3,228,175,397,en,12,PetShop Molecão,petmolecao,410,Bring your cute dog here and we will wash and ...,1


## Predict

In [139]:
result = test_model.predict_proba(test_data)

In [140]:
pd.DataFrame(result, columns=["business","person"])

Unnamed: 0,business,person
0,0.48,0.52
1,0.18,0.82
2,0.2,0.8
3,0.62,0.38
