In [None]:
import os
import re
import fnmatch
import json

import numpy as np
import pandas as pd
import chardet
import gc
import matplotlib.pyplot as plt

from pprint import pprint
from time import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_is_fitted
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler
from sklearn.lda import LDA
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# Loading data

In [None]:
# Read every training CSV whose name starts with `input_prefix` and stack
# them into one de-duplicated dataframe called `train`.
input_dir = 'data/manually_categorized/'
input_prefix = 'actor_classification_train'
train = None
# Sorted so that the concatenation order (and therefore which copy of a
# duplicated row survives drop_duplicates) does not depend on the order in
# which the filesystem happens to list the directory.
for file_name in sorted(os.listdir(input_dir)):
  if fnmatch.fnmatch(file_name, input_prefix+'*.csv'):
    if train is None:
      print("==> Initializing input dataframe: ")
    else:
      print("==> Concatenating dataframe from " + file_name + ": ")
    # The context manager closes the handle as soon as the file is parsed;
    # previously one handle per CSV was left open.
    with open(input_dir+file_name,'rU') as csv_file:
      chunk = pd.read_csv(csv_file, engine='python', sep=",", quoting=1)
    train = chunk if train is None else pd.concat([train, chunk])
    train.drop_duplicates(inplace=True)
    print(train.shape)

In [None]:
# Optional: uncomment to iterate on a 1000-row random sample of the data.
# train = train.sample(1000)
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the last rows of the training frame.
train.tail()

# Feature Engineering

## Custom Transformer with LabelEncoder for 'verified'

In [None]:
class VerifiedTransformer(BaseEstimator, TransformerMixin):
    """Encode the ``verified`` column as an integer 0/1 flag.

    The mapping is fixed (falsy/missing -> 0, truthy -> 1) instead of being
    learned with a LabelEncoder inside ``transform``. A per-call encoder
    assigns codes from whatever values the current batch contains, so a batch
    holding only verified rows would have been encoded as 0.
    """

    # Textual spellings treated as "verified" (compared case-insensitively).
    _TRUE_STRINGS = frozenset(["true", "t", "yes", "y", "1"])

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def _to_flag(self, value):
        """Map one raw ``verified`` value to 0 or 1."""
        if hasattr(value, "lower"):
            # String-like value: only the known "true" spellings count.
            return 1 if value.strip().lower() in self._TRUE_STRINGS else 0
        if pd.isnull(value):
            # Missing values are treated as "not verified".
            return 0
        return int(bool(value))

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``verified`` column holds 0/1 ints."""
        X = X.copy()
        X["verified"] = [self._to_flag(v) for v in X["verified"].values]
        return X

In [None]:
# Sanity check of the transformer. transform() works on a copy and returns
# it, so the encoded values have to be read from the returned frame; `train`
# itself is left untouched.
verified_transformer = VerifiedTransformer()

print(verified_transformer.transform(train).verified.value_counts())

## Custom OneHotEncoding for lang

In [None]:
class LangOneHotEncoding(BaseEstimator, TransformerMixin):
    """One-hot encode the ``lang`` column using the languages seen in ``fit``.

    ``fit`` stores one ``lang_<code>`` feature name per string value found in
    ``X.lang`` (the 'Select Language...' placeholder and missing values are
    ignored). ``transform`` adds one 0/1 column per stored feature name and
    drops the original ``lang`` column.
    """

    def fit(self, X, y=None):
        """Record the ``lang_<code>`` feature names present in ``X.lang``."""
        valid_langs = set(X.lang) - set([None, np.nan, 'Select Language...'])
        # Only plain strings are kept, which also discards NaN floats.
        # Sorting makes the stored feature order reproducible instead of
        # depending on set iteration order.
        self.feature_names_ = sorted("lang_"+str(l) for l in valid_langs if type(l) == str)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``lang`` replaced by indicator columns."""
        check_is_fitted(self, 'feature_names_')

        X = X.copy()
        # Assign the filled column back rather than calling
        # fillna(inplace=True) on X["lang"], which is not guaranteed to
        # write through to the frame.
        X["lang"] = X["lang"].fillna("")
        langs = X["lang"].values
        for lang_feature in self.feature_names_:
            code = lang_feature[len("lang_"):]
            # Comparing against the bare code (instead of building "lang_"+v)
            # keeps non-string cells from raising; they simply encode as 0.
            X[lang_feature] = [(1 if v == code else 0) for v in langs]

        X = X.drop(["lang"], axis=1)
        return X

lang_ohe = LangOneHotEncoding().fit(train)

## Replacing NAs on textual fields

In [None]:
class FillTextNA(BaseEstimator, TransformerMixin):
    """Replace missing values in the given columns with a fixed value.

    Parameters
    ----------
    cols : list of column names to fill; names absent from the frame are
        skipped.
    replace_by : value written in place of missing entries (default "").
    """

    def __init__(self, cols, replace_by=""):
        self.cols = cols
        self.replace_by = replace_by

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values in ``cols`` replaced."""
        X = X.copy()
        for c in self.cols:
            if c in X:
                # Assign the result back: fillna(inplace=True) on X[c] acts on
                # an intermediate object and is not guaranteed to update X.
                X[c] = X[c].fillna(self.replace_by)
        return X

## DataFrameTfidfVectorizer for textual fields

In [None]:
class DataFrameTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer that reads one text column of a dataframe.

    ``fit`` learns the vocabulary from ``dataframe[col]``. ``transform`` and
    ``fit_transform`` return a copy of the dataframe with one extra column per
    vocabulary term, named ``<prefix>_<term>``; the text column itself is kept
    (with its characters normalised by ``treat_special_chars``).

    Parameters
    ----------
    col : name of the text column to vectorize.
    prefix : prefix for the generated column names; defaults to ``col``.
    All remaining parameters are forwarded to ``TfidfVectorizer``.
    """

    def __init__(self, col, prefix=None, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, analyzer='word',
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), max_df=1.0, min_df=1,
                 max_features=None, vocabulary=None, binary=False,
                 dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):
        # norm / use_idf / smooth_idf / sublinear_tf were accepted but never
        # forwarded, so values passed by callers were silently replaced by the
        # base-class defaults.
        super(DataFrameTfidfVectorizer, self).__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype, norm=norm, use_idf=use_idf, smooth_idf=smooth_idf,
            sublinear_tf=sublinear_tf)

        self.col = col
        self.prefix = prefix or col

    def treat_special_char(self, c):
        """Normalise a single character.

        Digits become '0'. Any other character is decoded with the encoding
        chardet detects for it (falling back to "KOI8-R"); if that fails for
        any reason the character is replaced by '9'.
        """
        # NOTE(review): ``c.decode`` only exists on byte strings, so this
        # looks written for Python 2 ``str`` input - confirm before porting.
        try:
            encoding = chardet.detect(str(c))['encoding'] or "KOI8-R"
            return '0' if c.isdigit() else c.decode(encoding)
        except Exception:
            # Deliberate best-effort fallback. ``Exception`` rather than a
            # bare ``except`` so Ctrl-C / SystemExit still stop the loop.
            return '9'

    def treat_special_chars(self, col):
        """Return a list with every text of ``col`` normalised char by char.

        Missing values are replaced by the literal text "null" first.
        """
        col = col.fillna("null")
        return [''.join([self.treat_special_char(c) for c in list(n)])
                for n in col.values]

    def _with_clean_text(self, dataframe):
        """Copy ``dataframe`` and normalise the characters of the text column."""
        dataframe = dataframe.copy()
        dataframe[self.col] = self.treat_special_chars(dataframe[self.col])
        return dataframe

    def _append_features(self, dataframe, field_matrix):
        """Append the term columns of ``field_matrix`` to ``dataframe``."""
        features_names = ["_".join([self.prefix, f])
                          for f in super(DataFrameTfidfVectorizer, self).get_feature_names()]
        field_df = pd.DataFrame(field_matrix.toarray(), columns=features_names)
        # Row i of the matrix belongs to row i of the dataframe. Joining on
        # the dataframe's own index labels (as before) misaligned or
        # duplicated rows whenever that index was not exactly 0..n-1, e.g.
        # for cross-validation folds or concatenated frames. Align by
        # position, then restore the caller's index.
        combined = dataframe.reset_index(drop=True).join(field_df)
        combined.index = dataframe.index
        return combined

    def fit(self, dataframe, y=None):
        """Learn vocabulary and idf from ``dataframe[col]``; returns ``self``."""
        dataframe = self._with_clean_text(dataframe)
        super(DataFrameTfidfVectorizer, self).fit(dataframe[self.col])
        return self

    def fit_transform(self, dataframe, y=None):
        """Fit on ``dataframe[col]`` and return the dataframe plus term columns."""
        dataframe = self._with_clean_text(dataframe)
        field_matrix = super(DataFrameTfidfVectorizer, self).fit_transform(dataframe[self.col])
        return self._append_features(dataframe, field_matrix)

    def transform(self, dataframe, copy=True):
        """Return the dataframe plus term columns, using the fitted vocabulary."""
        dataframe = self._with_clean_text(dataframe)
        field_matrix = super(DataFrameTfidfVectorizer, self).transform(dataframe[self.col])
        return self._append_features(dataframe, field_matrix)

In [None]:
# One vectorizer per textual view of a profile. Each one adds its term
# columns to the dataframe, named with its prefix (the column name when no
# prefix is given).
# NOTE(review): the trailing "#False" / "#8" / "#5" markers look like
# previously tried values - confirm before relying on them.

# Character 3- to 5-grams of the display name, columns prefixed "name_c".
name_chars_tfidf = DataFrameTfidfVectorizer(col="name", 
                                            prefix="name_c",
                                            ngram_range=(3, 5), 
                                            analyzer="char",
                                            binary=True, #False
                                            min_df = 50) #8

# Word uni- and bi-grams of the display name, columns prefixed "name_w".
name_words_tfidf = DataFrameTfidfVectorizer(col="name", 
                                            prefix="name_w", 
                                            token_pattern=r'\w+',
                                            ngram_range=(1, 2), 
                                            analyzer="word",
                                            binary=True, #False
                                            min_df = 10) #8

# Character 3- to 5-grams of the screen name.
screen_name_tfidf = DataFrameTfidfVectorizer(col="screen_name", 
                                             ngram_range=(3, 5), 
                                             analyzer="char",
                                             binary=True, #False
                                             min_df = 50) #8

# Word 1- to 3-grams of the profile summary, with English stop words removed.
summary_tfidf = DataFrameTfidfVectorizer(col="summary",
                                         token_pattern=r'\w+',
                                         ngram_range=(1, 3), 
                                         analyzer="word",
                                         binary=True, #False
                                         sublinear_tf=True, 
                                         stop_words='english',
                                         min_df = 50) #5

In [None]:
# name_chars_tfidf.fit(train)
# name_words_tfidf.fit(train)
# screen_name_tfidf.fit(train)
# summary_tfidf.fit(train)

## Further textual analysis

In [None]:
class TextToLowerCase(BaseEstimator, TransformerMixin):
    """Lower-case the text held in the configured dataframe columns.

    ``cols`` lists the column names to process; names that are not present in
    the frame are skipped.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the selected columns lower-cased."""
        frame = X.copy()
        for column in self.cols:
            if column not in frame:
                continue
            lowered = []
            for text in frame[column].values:
                lowered.append(text.lower())
            frame[column] = lowered
        return frame

In [None]:
class NumberOfWords(BaseEstimator, TransformerMixin):
    """Add a ``number_of_words_in_<col>`` column for each configured column.

    The count is the number of pieces obtained by splitting the text on a
    single space character, so an empty text counts as 1 and consecutive
    spaces add to the count.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the word-count columns appended."""
        frame = X.copy()
        for column in self.cols:
            if column not in frame:
                continue
            texts = frame[column].values
            frame["number_of_words_in_"+column] = list(
                map(lambda text: len(text.split(' ')), texts))
        return frame

In [None]:
class NumberNonAlphaNumChars(BaseEstimator, TransformerMixin):
    """Add a ``number_of_non_alphanum_in_<col>`` column per configured column.

    The value is the length of the text once every character matched by the
    regex class ``[\\w\\d]`` has been removed.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the count columns appended."""
        frame = X.copy()
        for column in self.cols:
            if column not in frame:
                continue
            counts = []
            for text in frame[column].values:
                leftovers = re.sub(r"[\w\d]", "", text)
                counts.append(len(leftovers))
            frame["number_of_non_alphanum_in_"+column] = counts
        return frame

In [None]:
class NumberUpperCaseChars(BaseEstimator, TransformerMixin):
    """Add a ``number_of_upper_case_chars_in_<col>`` column per column.

    Only the letters A-Z are counted: the value is the length of the text
    after removing every character outside ``[A-Z]``.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the upper-case count columns appended."""
        frame = X.copy()
        for column in self.cols:
            if column not in frame:
                continue
            counts = []
            for text in frame[column].values:
                capitals_only = re.sub(r"[^A-Z]", "", text)
                counts.append(len(capitals_only))
            frame["number_of_upper_case_chars_in_"+column] = counts
        return frame

In [None]:
class NumberCamelCaseWords(BaseEstimator, TransformerMixin):
    """Add a ``number_of_camel_case_words_in_<col>`` column per column.

    A hit is an A-Z letter immediately followed by an a-z letter, located
    either at the very start of the text or right after a whitespace
    character.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the count columns appended."""
        frame = X.copy()
        for column in self.cols:
            if column not in frame:
                continue
            counts = []
            for text in frame[column].values:
                hits = re.findall(r"^[A-Z][a-z]|\s[A-Z][a-z]", text)
                counts.append(len(hits))
            frame["number_of_camel_case_words_in_"+column] = counts
        return frame

In [None]:
class NumberOfMentions(BaseEstimator, TransformerMixin):
    """Add a ``number_of_mentions_in_<col>`` column per configured column.

    A mention is an ``@`` followed by an ASCII letter, located at the very
    start of the text or right after a whitespace character.

    Parameters
    ----------
    cols : list of text column names; names absent from the frame are skipped.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the mention-count columns appended."""
        X = X.copy()
        for c in self.cols:
            if c in X:
                # "(?:^|\s)" instead of a bare "\s": a text that *starts*
                # with a mention ("@someone hello") previously counted as 0
                # because no whitespace precedes the "@".
                X["number_of_mentions_in_"+c] = [len(re.findall(r"(?:^|\s)@[a-zA-Z]", t))
                                                 for t in X[c].values]
        return X

In [None]:
class NumberOfPeriods(BaseEstimator, TransformerMixin):
    """Add a ``number_of_periods_in_<col>`` column per configured column.

    The value is the number of pieces obtained by splitting the text on
    ". " (full stop followed by a space), i.e. one more than the number of
    such separators.
    """
    # NOTE(review): "period" presumably means a sentence-like segment here,
    # not the punctuation mark - confirm.

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the segment-count columns appended."""
        frame = X.copy()
        for column in self.cols:
            if column not in frame:
                continue
            counts = []
            for text in frame[column].values:
                segments = text.split(". ")
                counts.append(len(segments))
            frame["number_of_periods_in_"+column] = counts
        return frame

In [None]:
class AvgWordsPerPeriod(BaseEstimator, TransformerMixin):
    """Add an ``avg_words_per_period_in_<col>`` column per configured column.

    Each text is split on ". " into segments; every segment is split on a
    single space; the value is the mean number of pieces per segment.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the average columns appended."""
        frame = X.copy()
        for column in self.cols:
            if column not in frame:
                continue
            averages = []
            for text in frame[column].values:
                words_per_segment = [len(segment.split(" "))
                                     for segment in text.split(". ")]
                averages.append(np.mean(words_per_segment))
            frame["avg_words_per_period_in_"+column] = averages
        return frame

In [None]:
class MentionToFamilyRelation(BaseEstimator, TransformerMixin):
    """Add a ``mention_to_family_relation_in_<col>`` column per column.

    The value is the total number of regex hits, over a fixed list of family
    relation words, where the word is delimited by the text boundaries or by
    non-word characters. Matching is case-sensitive.
    """

    # Family relation words looked for, in the order they are searched.
    _RELATIONS = ("husband", "wife", "father", "mother", "daddy", "mommy",
                  "grandfather", "grandmother", "grandpa", "grandma")

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def count_mentions(self, t):
        """Return how many relation-word hits the text ``t`` contains."""
        return sum(len(re.findall(r"(^|\W)%s(\W|$)" % relation, t))
                   for relation in self._RELATIONS)

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the mention-count columns appended."""
        frame = X.copy()
        for column in self.cols:
            if column not in frame:
                continue
            counts = []
            for text in frame[column].values:
                counts.append(self.count_mentions(text))
            frame["mention_to_family_relation_in_"+column] = counts
        return frame

## External Data

### Professional Occupations

Mentions of US occupations, using the occupation list published at http://data.okfn.org/data/johnlsheridan/occupations

In [None]:
# Download the occupations table, lower-case its "Occupations" column and
# preview the first rows. Requires network access.
occupations = pd.read_csv("https://raw.githubusercontent.com/johnlsheridan/occupations/master/occupations.csv")
occupations.Occupations = [o.lower() for o in occupations.Occupations.values]
occupations.head()

In [None]:
class MentionToOccupation(BaseEstimator, TransformerMixin):
    """Add a ``mention_to_occupation_in_<col>`` column per configured column.

    ``fit`` loads a list of occupation names (lower-cased) and compiles one
    pattern per occupation. The feature is the total number of occurrences,
    over all occupations, where the occupation is not directly preceded or
    followed by a word character. Matching is case-sensitive, so texts are
    expected to be lower-cased beforehand.

    Parameters
    ----------
    cols : list of text column names; names absent from the frame are skipped.
    """

    OCCUPATIONS_URL = "https://raw.githubusercontent.com/johnlsheridan/occupations/master/occupations.csv"

    # Occupation list shared by all instances once downloaded, so refitting
    # (e.g. once per candidate and fold of a grid search) does not hit the
    # network again.
    _occupations_cache = None

    def __init__(self, cols):
        self.cols = cols

    @staticmethod
    def _compile(occupations):
        """Build one compiled, regex-escaped pattern per occupation."""
        # re.escape: occupation names are literal text, not regex source.
        # The lookarounds enforce the same boundaries as "(^|\W)...(\W|$)"
        # without consuming the separator, so two occurrences separated by a
        # single character are both counted.
        return [re.compile(r"(?<!\w)%s(?!\w)" % re.escape(o)) for o in occupations]

    def fit(self, X, y=None):
        """Load the occupation list (downloading it on first use)."""
        cls = type(self)
        if cls._occupations_cache is None:
            occupations = pd.read_csv(cls.OCCUPATIONS_URL)
            # Missing and blank entries are dropped: a blank pattern would
            # match at almost every position of every text.
            cls._occupations_cache = [o.lower()
                                      for o in occupations.Occupations.dropna().values
                                      if o.strip()]
        self.occupations_ = list(cls._occupations_cache)
        self.patterns_ = self._compile(self.occupations_)
        return self

    def count_mentions(self, t):
        """Return the total number of occupation mentions in the text ``t``."""
        patterns = getattr(self, "patterns_", None)
        if patterns is None:
            # Instance fitted (or unpickled) before patterns_ existed.
            patterns = self.patterns_ = self._compile(self.occupations_)
        return sum(len(p.findall(t)) for p in patterns)

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the mention-count columns appended."""
        check_is_fitted(self, 'occupations_')
        X = X.copy()
        for c in self.cols:
            if c in X:
                X["mention_to_occupation_in_"+c] = [self.count_mentions(t)
                                                     for t in X[c].values]
        return X

mention_to_occupation = MentionToOccupation(["summary"]).fit(train)

### Person Names

Person names according to http://deron.meranda.us/data/census-dist-female-first.txt and http://deron.meranda.us/data/census-dist-male-first.txt

In [None]:
# Download the female and male first-name files; each is read into a
# one-column frame whose column is called "name". Requires network access.
female_names = pd.read_csv("http://deron.meranda.us/data/census-dist-female-first.txt", names=["name"])
male_names   = pd.read_csv("http://deron.meranda.us/data/census-dist-male-first.txt", names=["name"])

In [None]:
# Preview the raw female first-name rows.
female_names.head()

In [None]:
# Preview the raw male first-name rows.
male_names.head()

In [None]:
# Lower-case every raw row and strip everything that is not a-z, leaving
# only the letters. From here on `female_names` / `male_names` are plain
# lists rather than dataframes.
female_names = [re.sub(r"[^a-z]","",n.lower()) for n in female_names.name.values]
male_names   = [re.sub(r"[^a-z]","",n.lower()) for n in male_names.name.values]
# Union of both lists without duplicates.
person_names = list(set(male_names + female_names))
print len(female_names)
print len(male_names)
print len(person_names)

In [None]:
class PersonNames(BaseEstimator, TransformerMixin):
    """Add a ``person_names_in_<col>`` column per configured column.

    ``fit`` loads female and male first names, reduced to their lower-case
    a-z letters. The feature is the number of word tokens (maximal runs of
    word characters) of the text that are one of those names. Matching is
    case-sensitive, so texts are expected to be lower-cased beforehand.

    This counts the same occurrences as searching each name delimited by
    non-word characters, but with one pass over the text instead of one regex
    search per known name, and a name repeated back to back is counted every
    time.

    Parameters
    ----------
    cols : list of text column names; names absent from the frame are skipped.
    """

    NAME_URLS = ("http://deron.meranda.us/data/census-dist-female-first.txt",
                 "http://deron.meranda.us/data/census-dist-male-first.txt")

    # Name list shared by all instances once downloaded, so refitting (e.g.
    # once per candidate and fold of a grid search) does not hit the network
    # again.
    _names_cache = None

    _TOKEN = re.compile(r"\w+")

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """Load the first-name list (downloading it on first use)."""
        cls = type(self)
        if cls._names_cache is None:
            names = set()
            for url in cls.NAME_URLS:
                table = pd.read_csv(url, names=["name"])
                names.update(re.sub(r"[^a-z]", "", n.lower())
                             for n in table.name.values)
            # A row without any letter normalises to "", which must never be
            # treated as a name.
            names.discard("")
            cls._names_cache = sorted(names)
        self.person_names_ = list(cls._names_cache)
        self.person_name_set_ = frozenset(self.person_names_)
        return self

    def count_mentions(self, t):
        """Return how many word tokens of the text ``t`` are known names."""
        names = getattr(self, "person_name_set_", None)
        if names is None:
            # Instance fitted (or unpickled) before person_name_set_ existed.
            names = self.person_name_set_ = frozenset(self.person_names_)
        return sum(1 for token in self._TOKEN.findall(t) if token in names)

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the name-count columns appended."""
        check_is_fitted(self, 'person_names_')
        X = X.copy()
        for c in self.cols:
            if c in X:
                X["person_names_in_"+c] = [self.count_mentions(t)
                                            for t in X[c].values]
        return X

person_names = PersonNames(["name"]).fit(train)

## Drop Columns Transformer

In [None]:
class DropColumnsTransformer(BaseEstimator, TransformerMixin):
    """Remove the configured columns from a dataframe.

    ``cols`` lists the column names to drop; names that are not present in
    the frame are ignored.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X, y=None):
        """Return a new frame without the configured columns."""
        doomed = []
        for column in self.cols:
            if column in X and column not in doomed:
                doomed.append(column)
        if not doomed:
            return X.copy()
        return X.drop(doomed, axis=1)

## Final Imputer and np array transformer

In [None]:
class NumpyArrayTransformer(BaseEstimator, TransformerMixin):
    """Turn the feature dataframe into a plain numpy array.

    Columns are ordered by sorted column name and missing values are replaced
    by 0 before the conversion.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as an array with name-sorted columns and NaN -> 0."""
        # reindex(columns=...) replaces DataFrame.reindex_axis, which was
        # deprecated and later removed from pandas; both return a new frame,
        # so the caller's dataframe is not modified.
        X = X.reindex(columns=sorted(X.columns))
        X = X.fillna(0)
        return np.asarray(X)

## [Debugger for pipeline]

In [None]:
class Debugger(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints the type of the data it sees.

    ``name`` is stored on the instance but is not used when printing.
    """

    def __init__(self, name=""):
        self.name = name

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X, y=None):
        """Print the type of ``X`` between two rule lines; return ``X`` as is."""
        rule = "-------------------------"
        for line in (rule, type(X), rule):
            print(line)
        return X

# Training the model

In [None]:
# Name of the target column; every other column of `train` is a feature.
outcome = "manual_segment"

# Keep the frame's own column order. Building the list through a set
# difference left the order up to set iteration.
features = [column for column in train.columns if column != outcome]

## GridSearchCV

In [None]:
# NOTE(review): the figures below look like scores obtained with each
# candidate model in earlier runs - confirm.
# RandomForestClassifier - 0.69
# LogisticRegression - 0.57
# GaussianNB 0.45
# LinearSVC 0.5
# KNeighborsClassifier 0.58

# Number of trees offered to the grid search for the random forest.
n_estimators = 100

# Model Pipeline
# Steps run in order: drop unused columns, encode `verified` and `lang`,
# fill missing text, derive hand-crafted text counts (computed before
# lower-casing, so case-based counts see the original text), lower-case,
# count family / person-name / occupation mentions, add TF-IDF term columns,
# drop the raw text columns, convert to a numpy array and fit the classifier.
# `lang_ohe`, `person_names` and `mention_to_occupation` are the instances
# already fitted in earlier cells.
pipeline = Pipeline([ ("drop_cols", DropColumnsTransformer(["segment","link"])),
                      ("verified", VerifiedTransformer()),
                      ("lang", lang_ohe),
                      ("fill_text_na", FillTextNA(["screen_name","name","summary"], "null")),
                      ("qt_words", NumberOfWords(["name","summary"])),
                      ("qt_non_alphanum_chars", NumberNonAlphaNumChars(["name","summary"])),
                      ("qt_upper_case_chars", NumberUpperCaseChars(["name","summary"])),
                      ("qt_camel_case_words", NumberCamelCaseWords(["name","summary"])),
                      ("qt_mentions", NumberOfMentions(["summary"])),
                      ("qt_periods", NumberOfPeriods(["summary"])),
                      ("avg_words_per_period", AvgWordsPerPeriod(["summary"])),
                      ("lower_case", TextToLowerCase(["screen_name","name","summary"])),
                      ("family", MentionToFamilyRelation(["summary"])),
                      ("person_names", person_names),
                      ("occupations", mention_to_occupation),
                      ("name_chars_tfidf", name_chars_tfidf),
                      ("name_words_tfidf", name_words_tfidf),
                      ("screen_name_tfidf", screen_name_tfidf),
                      ("summary_tfidf", summary_tfidf),
                      ("drop_text_cols", DropColumnsTransformer(["screen_name","name","summary"])),
                      ("nparray", NumpyArrayTransformer()),
                      ("model", RandomForestClassifier())])

# GridSearchCV params
# A list of dicts is searched as separate grids: each entry below varies a
# single parameter while every other setting keeps the value the pipeline
# was built with.
# NOTE(review): as a consequence n_estimators=100 is only tried in its own
# grid; the other grids use the classifier's default - confirm this is
# intended.
parameters = [
    {"name_chars_tfidf__analyzer": ["char", "char_wb"]},
    {"name_chars_tfidf__min_df": [10, 30, 50, 80]},
    {"name_words_tfidf__min_df": [10, 30, 50, 80]},
    {"screen_name_tfidf__min_df": [10, 30, 50, 80]},
    {"summary_tfidf__min_df": [10, 30, 50, 80]},
    {"model__n_estimators": [n_estimators]},
]

# GridSearchCV setup
# n_jobs=-1 uses all available cores.
# NOTE(review): error_score=0 scores a candidate whose fit raises as 0
# instead of propagating the error, which can hide broken configurations.
# NOTE(review): no random_state is set on the classifier, so results may
# differ between runs.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, error_score=0)

In [None]:
# Run the grid search and report its duration, the best score and the value
# chosen for every searched parameter.
# Every print() call takes exactly one argument so the output is the same
# under the Python 2 print statement used by the rest of this notebook and
# under the Python 3 print function; the multi-argument and empty calls
# printed tuples such as ('pipeline:', [...]) and () under Python 2.
print("Performing grid search...")
print("pipeline: %s" % [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(train.loc[:,features], train.loc[:,outcome])
print("done in %0.3fs" % (time() - t0))
print("")
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
# Full (nested) parameter dict of the best pipeline; later cells read it too.
best_parameters = grid_search.best_estimator_.get_params()

for param in parameters:
    for param_name in param.keys():
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

## Model Pipeline

In [None]:
# Apply only the settings the grid search selected, then refit on all data.
# best_params_ holds just the searched keys. Passing the full
# best_estimator_.get_params() dict also pushed `steps` and every step
# object into the pipeline, making it share its step instances with
# grid_search.best_estimator_.
pipeline.set_params(**grid_search.best_params_)
pipeline.fit(train.loc[:,features], train.loc[:,outcome])

## Persist model pipeline

In [None]:
# Persist the fitted pipeline to disk with joblib (compress=9).
# NOTE(review): the pipeline contains classes defined in this notebook;
# loading the file elsewhere presumably requires the same class definitions
# to be available - confirm.
model_path = "data/manually_categorized/actor_classification_trained_model_20151129.pkl"
joblib.dump(pipeline, model_path, compress=9)

# Test

In [None]:
# Reload the pipeline that was just written to `model_path`.
# joblib files are pickle-based: only load files from a trusted source.
test_model = joblib.load(model_path)

## Loading data

In [None]:
# Three hand-written profiles as JSON strings, parsed into a dataframe with
# one row per profile.
# NOTE(review): the second and third entries are identical - confirm whether
# a different profile was intended.
test_data = ['{"name":"Светлана Петухова","screen_name":"svpetuhova26623","summary":"","lang":"ru","favourites_count":23,"statuses_count":14,"friends_count":2,"followers_count":1,"listed_count":1,"verified":0}',
             '{"lang":"en","summary":"Artist, Writer, Designer. Tweets on tech, culture, art, animals, love the socioeconomy.","verified":0,"followers_count":175,"friends_count":397,"favourites_count":228,"statuses_count":410,"listed_count":12,"name":"Daniel Adornes","screen_name":"daniel_adornes"}',
             '{"lang":"en","summary":"Artist, Writer, Designer. Tweets on tech, culture, art, animals, love the socioeconomy.","verified":0,"followers_count":175,"friends_count":397,"favourites_count":228,"statuses_count":410,"listed_count":12,"name":"Daniel Adornes","screen_name":"daniel_adornes"}']
test_data = [json.loads(t) for t in test_data]
test_data = pd.DataFrame(test_data)

In [None]:
# Display the parsed test profiles.
test_data

## Predict

In [None]:
# Class probabilities for each test profile, from the reloaded pipeline.
result = test_model.predict_proba(test_data)

In [None]:
# Show the probabilities as a table.
# NOTE(review): the column labels are hard-coded and assume predict_proba
# returns the classes in the order business, person - confirm against the
# fitted classifier's classes_.
pd.DataFrame(result, columns=["business","person"])