In [22]:
import os
import fnmatch
import json

import numpy as np
import pandas as pd
import chardet
import gc
import matplotlib.pyplot as plt

from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_is_fitted
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler
from sklearn.lda import LDA
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# Loading data

In [23]:
input_dir = 'data/manually_categorized/'
input_prefix = 'actor_classification_train'
train = None
for file in os.listdir(input_dir):
  if fnmatch.fnmatch(file, input_prefix+'*.csv'):
    if train is None:
      print "==> Initializing input dataframe: "
      train = pd.read_csv(open(input_dir+file,'rU'),
                          engine='python', sep=",", quoting=1)
    else:
      print "==> Concatenating dataframe from " + file + ": "
      train = pd.concat([train, pd.read_csv(open(input_dir+file,'rU'),
                          engine='python', sep=",", quoting=1)])
    train.drop_duplicates(inplace=True)
    print train.shape

==> Initializing input dataframe: 
(20245, 13)
==> Concatenating dataframe from actor_classification_train_copy.csv: 
(20245, 13)


In [24]:
train = train.sample(1000)
train.head()

Unnamed: 0,name,screen_name,lang,favourites_count,statuses_count,friends_count,summary,followers_count,link,listed_count,verified,segment,manual_segment
5036,仆���Peace God 仆��� Payne,Rudeboynookie,en,1314,56792,34235,#Artist #Father #artist #Promoter #PeaceGod #N...,55061,http://www.twitter.com/Rudeboynookie,82,False,person,0
20168,Mansoor Ali Khan,_Mans00r,en,9238,2224,980,"Senior Anchor/Journalist/Blogger, ICFJ fellow,...",234322,http://www.twitter.com/_Mans00r,562,False,person,1
9009,Marco Staff,MarcoStaff,en,860,6362,15436,Owner & Creator of Expensiv Lifestyle.,40736,http://www.twitter.com/MarcoStaff,110,False,business,0
16957,Christine Gilbert,cb_gilbert,en,768,12027,842,"Writer, Photographer & Filmmaker. Wife of the ...",45229,http://www.twitter.com/cb_gilbert,2070,False,person,1
2777,Andrea Lowell,TheAndreaLowell,en,40,5198,447,"TV & Radio Host, Model, and Raw Superfood Enth...",41204,http://www.twitter.com/TheAndreaLowell,432,False,business,0


In [25]:
train.tail()

Unnamed: 0,name,screen_name,lang,favourites_count,statuses_count,friends_count,summary,followers_count,link,listed_count,verified,segment,manual_segment
2837,Sean Melvin,ThatBlokeSean,en,5081,60040,162300,"Financial Systems Consultant, Business Start U...",163176,http://www.twitter.com/ThatBlokeSean,1350,False,person,1
8218,�ʙ�����,mo__la,ar,295,81724,161669,���� �����_�� �_���� �����_��,159778,http://www.twitter.com/mo__la,212,False,business,0
9823,LeBron James �_,LeBronJames,en,1135,1115,806213,��� #StriveForGreatness��_����_ #TeamLeBron pa...,3096517,http://www.twitter.com/LeBronJames,876,False,person,1
2755,Exotic cars for sale,theBestOfCars,en,20,254,95,A secret exotic car community,35066,http://www.twitter.com/theBestOfCars,96,False,business,0
16518,IG: ClayPerryMusic,ClayPerryMusic,en,20579,61698,27604,Recording Artist | Assistant: @MikeJQuintero |...,61973,http://www.twitter.com/ClayPerryMusic,93,False,person,1


# Feature Engineering

## Custom Transformer with LabelEncoder for 'verified'

In [26]:
class VerifiedTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X.verified.fillna(False, inplace=True)
        X.verified = LabelEncoder().fit_transform(X.verified)
        return X

In [27]:
verified_transformer = VerifiedTransformer()
verified_transformer.transform(train)

print train.verified.value_counts()

0    849
1    151
dtype: int64


## Custom OneHotEncoding for lang

In [28]:
class LangOneHotEncoding(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        valid_langs = list(set(X.lang) - set([None, np.nan, 'Select Language...']))
        self.feature_names_ = ["lang_"+l for l in valid_langs]
        return self

    def transform(self, X, y=None):
        check_is_fitted(self, 'feature_names_')
        
        X["lang"].fillna("", inplace=True)
        for lang_feature in self.feature_names_:
            X[lang_feature] = [(1 if lang_feature == "lang_"+v else 0) for v in X["lang"].values]
        
        X.drop(["lang"], axis=1, inplace=True)
        return X
    
lang_ohe = LangOneHotEncoding().fit(train)

## Special characters Transformation [DELETE AFTER EVERYTHING WORKS]

In [29]:
# class SpecialCharactersTransformer(TransformerMixin):

#     def __init__(self, text_fields):
#         self.text_fields = text_fields

#     def fit(self, X, y=None):
#         return self

#     def treat_special_char(self, c):
#         try:
#             encoding = chardet.detect(str(c))['encoding'] or "KOI8-R"
#             return '0' if c.isdigit() else c.decode(encoding)
#         except UnicodeDecodeError:        
#             return '9'

#     def transform(self, X, y=None):
#         for field in self.text_fields:
# #             X.ix[X[field].isnull(), field] = "null"
# #             X[field] = map(lambda n: ''.join(map(lambda c: self.treat_special_char(c), list(n))), X[field].values)
#             X[field].fillna("null", inplace=True)
#             X[field] = [''.join([self.treat_special_char(c) for c in list(n)]) for n in X[field].values]

#         return X

## DataFrameTfidfVectorizer

In [30]:
class DataFrameTfidfVectorizer(TfidfVectorizer):

    def __init__(self, col, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, analyzer='word',
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), max_df=1.0, min_df=1,
                 max_features=None, vocabulary=None, binary=False,
                 dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):
        super(DataFrameTfidfVectorizer, self).__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype)

        self.col = col
        
    def treat_special_char(self, c):
        try:
            encoding = chardet.detect(str(c))['encoding'] or "KOI8-R"
            return '0' if c.isdigit() else c.decode(encoding)
        except:        
            return '9'

    def treat_special_chars(self, col):
        col.fillna("null", inplace=True)
        col = [''.join([self.treat_special_char(c) for c in list(n)]) 
               for n in col.values]
        return col

    def fit(self, dataframe, y=None):
        dataframe[self.col] = self.treat_special_chars(dataframe[self.col])
        super(DataFrameTfidfVectorizer, self).fit(dataframe[self.col])
        return self

    def fit_transform(self, dataframe, y=None):
        dataframe[self.col] = self.treat_special_chars(dataframe[self.col])
        field_matrix = super(DataFrameTfidfVectorizer, self).fit_transform(dataframe[self.col])
        features_names = map(lambda f: "_".join([self.col,f]), super(DataFrameTfidfVectorizer, self).get_feature_names())
        field_df = pd.DataFrame(field_matrix.A, columns=features_names)

        dataframe = dataframe.join(field_df)
        dataframe.drop([self.col], axis=1, inplace=True)

        return dataframe

    def transform(self, dataframe, copy=True):
        dataframe[self.col] = self.treat_special_chars(dataframe[self.col])
        field_matrix = super(DataFrameTfidfVectorizer, self).transform(dataframe[self.col])
        features_names = map(lambda f: "_".join([self.col,f]), super(DataFrameTfidfVectorizer, self).get_feature_names())
        field_df = pd.DataFrame(field_matrix.A, columns=features_names)

        dataframe = dataframe.join(field_df)
        dataframe.drop([self.col], axis=1, inplace=True)

        return dataframe

## DataFrameTfidfVectorizer for textual fields

In [31]:
name_tfidf = DataFrameTfidfVectorizer(col="name", 
                                      ngram_range=(3, 5), 
                                      analyzer="char",
                                      binary=True, #False
                                      min_df = 5) #8

screen_name_tfidf = DataFrameTfidfVectorizer(col="screen_name", 
                                             ngram_range=(3, 5), 
                                             analyzer="char",
                                             binary=True, #False
                                             min_df = 5) #8

summary_tfidf = DataFrameTfidfVectorizer(col="summary",
                                         token_pattern=r'\w+',
                                         ngram_range=(1, 3), 
                                         analyzer="word",
                                         binary=True, #False
                                         sublinear_tf=True, 
                                         stop_words='english',
                                         min_df = 5) #5

In [32]:
name_tfidf.fit(train)
screen_name_tfidf.fit(train)
summary_tfidf.fit(train)

DataFrameTfidfVectorizer(analyzer='word', binary=True, col='summary',
             decode_error='strict', dtype=<type 'numpy.int64'>,
             encoding='utf-8', input='content', lowercase=True, max_df=1.0,
             max_features=None, min_df=5, ngram_range=(1, 3), norm=u'l2',
             preprocessor=None, smooth_idf=True, stop_words='english',
             strip_accents=None, sublinear_tf=False, token_pattern='\\w+',
             tokenizer=None, use_idf=True, vocabulary=None)

## Drop Columns Transformer

In [33]:
class DropColumnsTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        for c in self.cols:
            if c in X:
                X.drop([c], axis=1, inplace=True)
        return X

## Final Imputer and np array transformer

In [34]:
class NumpyArrayTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X.fillna(0, inplace=True)
        return np.asarray(X)

### [Debugger for pipeline]

In [42]:
class Debugger(BaseEstimator, TransformerMixin):

    def __init__(self, name=""):
        self.name = name

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        print "-------------------------"
        print type(X)
        print "-------------------------"
        return X

# Training the model

In [36]:
outcome = "manual_segment"

features = list(set(train.columns) - set([outcome]))

## KFold cross validation

In [44]:
doCV = True

if doCV:
    # KFold cross validation setup
    k_fold = KFold(n=len(train), n_folds=2, shuffle=True)
    b_scores, svc_scores = [], []

    n_estimators = 10

    for tr_indices, cv_indices in k_fold:
        
        print k_fold
        
        tr    = train.iloc[tr_indices,:].loc[:, features].copy()
        cv    = train.iloc[cv_indices,:].loc[:, features].copy()
        
        tr_y  = np.asarray(tr_cv.loc[tr_indices][outcome].values)
        cv_y  = np.asarray(tr_cv.loc[cv_indices][outcome].values)

        # Model Pipeline
        model = Pipeline([("drop_cols", DropColumnsTransformer(["segment","link"])),
                          ("debugger", Debugger()),
                          ("verified", VerifiedTransformer()),
                          ("lang", lang_ohe),
                          ("name_tfidf", name_tfidf),
                          ("screen_name_tfidf", screen_name_tfidf),
                          ("summary_tfidf", summary_tfidf),
                          ("nparray", NumpyArrayTransformer()),
                          ("scaler", StandardScaler()),
                          ("rf", RandomForestClassifier())])
        model.set_params(rf__n_estimators = n_estimators)

        model.fit(tr, tr_y)

        # Validate
        cv = model.transform(cv)
        print(confusion_matrix(cv_y, model.predict(cv)))    
        print('score:' + str(model.score(cv, cv_y)))


sklearn.cross_validation.KFold(n=1000, n_folds=2, shuffle=True, random_state=None)
-------------------------
<class 'pandas.core.frame.DataFrame'>
-------------------------
-------------------------
<class 'pandas.core.frame.DataFrame'>
-------------------------
-------------------------
<type 'numpy.ndarray'>
-------------------------


AttributeError: 'numpy.ndarray' object has no attribute 'verified'

## Model Pipeline

In [46]:
model = Pipeline([("drop_cols", DropColumnsTransformer(["segment","link"])),
                  ("verified", VerifiedTransformer()),
                  ("lang", lang_ohe),
                  ("name_tfidf", name_tfidf),
                  ("screen_name_tfidf", screen_name_tfidf),
                  ("summary_tfidf", summary_tfidf),
                  ("nparray", NumpyArrayTransformer()),
                  ("scaler", StandardScaler()),
                  ("rf", RandomForestClassifier())])
model.set_params(rf__n_estimators = n_estimators)
model.fit(train.loc[:,features], train.loc[:,outcome])

Pipeline(steps=[('drop_cols', DropColumnsTransformer(cols=['segment', 'link'])), ('verified', VerifiedTransformer()), ('lang', LangOneHotEncoding()), ('name_tfidf', DataFrameTfidfVectorizer(analyzer='char', binary=True, col='name',
             decode_error='strict', dtype=<type 'numpy.int64'>,
             ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

## Persist model pipeline

In [47]:
model_path = "data/manually_categorized/actor_classification_random_forest_20151129.pkl"
joblib.dump(model, model_path, compress=9)

['data/manually_categorized/actor_classification_random_forest_20151129.pkl']

# Test

In [48]:
test_model = joblib.load(model_path)

## Loading data

In [49]:
test_data = ['{"name":"Светлана Петухова","screen_name":"svpetuhova26623","summary":"","lang":"ru","favourites_count":23,"statuses_count":14,"friends_count":2,"followers_count":1,"listed_count":1,"verified":0}',
             '{"lang":"en","summary":"Artist, Writer, Designer. Tweets on tech, culture, art, animals, love the socioeconomy.","verified":0,"followers_count":175,"friends_count":397,"favourites_count":228,"statuses_count":410,"listed_count":12,"name":"Daniel Adornes","screen_name":"daniel_adornes"}',
             '{"lang":"en","summary":"Artist, Writer, Designer. Tweets on tech, culture, art, animals, love the socioeconomy.","verified":0,"followers_count":175,"friends_count":397,"favourites_count":228,"statuses_count":410,"listed_count":12,"name":"Daniel Adornes","screen_name":"daniel_adornes"}']
test_data = [json.loads(t) for t in test_data]
test_data = pd.DataFrame(test_data)

In [50]:
test_data

Unnamed: 0,favourites_count,followers_count,friends_count,lang,listed_count,name,screen_name,statuses_count,summary,verified
0,23,1,2,ru,1,Светлана Петухова,svpetuhova26623,14,,0
1,228,175,397,en,12,Daniel Adornes,daniel_adornes,410,"Artist, Writer, Designer. Tweets on tech, cult...",0
2,228,175,397,en,12,Daniel Adornes,daniel_adornes,410,"Artist, Writer, Designer. Tweets on tech, cult...",0


## Predict

In [51]:
result = test_model.predict_proba(test_data)

In [52]:
pd.DataFrame(result, columns=["business","person"])

Unnamed: 0,business,person
0,0.5,0.5
1,0.3,0.7
2,0.3,0.7
