In [19]:
import os
import fnmatch
import json

import numpy as np
import pandas as pd
import chardet
import gc
import matplotlib.pyplot as plt

from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_is_fitted
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler
from sklearn.lda import LDA
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# Loading data

In [20]:
input_dir = 'data/manually_categorized/'
input_prefix = 'actor_classification_train'
train = None
for file in os.listdir(input_dir):
  if fnmatch.fnmatch(file, input_prefix+'*.csv'):
    if train is None:
      print "==> Initializing input dataframe: "
      train = pd.read_csv(open(input_dir+file,'rU'),
                          engine='python', sep=",", quoting=1)
    else:
      print "==> Concatenating dataframe from " + file + ": "
      train = pd.concat([train, pd.read_csv(open(input_dir+file,'rU'),
                          engine='python', sep=",", quoting=1)])
    train.drop_duplicates(inplace=True)
    print train.shape

==> Initializing input dataframe: 
(20245, 13)
==> Concatenating dataframe from actor_classification_train_copy.csv: 
(20245, 13)


In [21]:
train.head()

Unnamed: 0,name,screen_name,lang,favourites_count,statuses_count,friends_count,summary,followers_count,link,listed_count,verified,segment,manual_segment
0,Guy,ZZ0,en,394,14626,122072,"Martial arts, contortion, 7-string elec violin...",122030,http://www.twitter.com/ZZ0,745.0,False,person,0
1,party here,zxynisgod,es,75357,169818,44087,���I hate One Direction.�� -people who have lo...,72756,http://www.twitter.com/zxynisgod,327.0,False,person,1
2,?��,Zxntio,en,24372,38662,118,@rantzantio,119602,http://www.twitter.com/Zxntio,5.0,False,business,1
3,�_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_܃_,zxkia,en-gb,9874,119158,127928,Don't take me seriously. || Turn off rts & tur...,197890,http://www.twitter.com/zxkia,170.0,False,person,0
4,,Zxkia,en,94,5514,12563,Somewhere between I want it and I got it. ~ Pr...,24316,http://twitter.com/Zxkia,,,,0


In [22]:
train.tail()

Unnamed: 0,name,screen_name,lang,favourites_count,statuses_count,friends_count,summary,followers_count,link,listed_count,verified,segment,manual_segment
20246,DOSE,___Dose___,en,8,20194,12,"A Strong Dose of Amazing People, Places, and T...",421003,http://www.twitter.com/___Dose___,2949,False,person,0
20247,One Direction News,_______1d_4ever,en,0,18578,37552,All the latest One Direction news from around ...,46205,http://www.twitter.com/_______1d_4ever,46,False,person,0
20248,Tyrne Clark,10223335,en,467,13481,60692,Shouldn't a strange and wonderful world be ful...,58620,http://www.twitter.com/10223335,54,False,person,1
20249,1776,1776,en,6586,9746,1293,Global incubator & seed fund helping startups ...,87459,http://www.twitter.com/1776,1125,False,person,0
20250,350 dot org,350,en,1153,25876,19502,Join a global movement that's inspiring the wo...,266424,http://www.twitter.com/350,5870,True,person,0


# Feature Engineering

## Custom Transformer with LabelEncoder for 'verified'

In [23]:
class VerifiedTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X.verified.fillna(False, inplace=True)
        X.verified = LabelEncoder().fit_transform(X.verified)
        return X

In [24]:
verified_transformer = VerifiedTransformer()
verified_transformer.transform(train)

print train.verified.value_counts()

0    17337
1     2908
dtype: int64


## Custom OneHotEncoding for lang

In [25]:
class LangOneHotEncoding(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        valid_langs = list(set(X.lang) - set([None, np.nan, 'Select Language...']))
        self.feature_names_ = ["lang_"+l for l in valid_langs]
        return self

    def transform(self, X, y=None):
        check_is_fitted(self, 'feature_names_')
        
        X["lang"].fillna("", inplace=True)
        for lang_feature in self.feature_names_:
            X[lang_feature] = [(1 if lang_feature == "lang_"+v else 0) for v in X["lang"].values]
        
        X.drop(["lang"], axis=1, inplace=True)
        return X
    
lang_ohe = LangOneHotEncoding().fit(train)

## Special characters Transformation

In [26]:
# class SpecialCharactersTransformer(TransformerMixin):

#     def __init__(self, text_fields):
#         self.text_fields = text_fields

#     def fit(self, X, y=None):
#         return self

#     def treat_special_char(self, c):
#         try:
#             encoding = chardet.detect(str(c))['encoding'] or "KOI8-R"
#             return '0' if c.isdigit() else c.decode(encoding)
#         except UnicodeDecodeError:        
#             return '9'

#     def transform(self, X, y=None):
#         for field in self.text_fields:
# #             X.ix[X[field].isnull(), field] = "null"
# #             X[field] = map(lambda n: ''.join(map(lambda c: self.treat_special_char(c), list(n))), X[field].values)
#             X[field].fillna("null", inplace=True)
#             X[field] = [''.join([self.treat_special_char(c) for c in list(n)]) for n in X[field].values]

#         return X

## DataFrameTfidfVectorizer

In [27]:
class DataFrameTfidfVectorizer(TfidfVectorizer):

    def __init__(self, col, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, analyzer='word',
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), max_df=1.0, min_df=1,
                 max_features=None, vocabulary=None, binary=False,
                 dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):
        super(DataFrameTfidfVectorizer, self).__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype)

        self.col = col
        
    def treat_special_char(self, c):
        try:
            encoding = chardet.detect(str(c))['encoding'] or "KOI8-R"
            return '0' if c.isdigit() else c.decode(encoding)
        except:        
            return '9'

    def treat_special_chars(self, col):
        col.fillna("null", inplace=True)
        col = [''.join([self.treat_special_char(c) for c in list(n)]) 
               for n in col.values]
        return col

    def fit(self, dataframe, y=None):
        dataframe[self.col] = self.treat_special_chars(dataframe[self.col])
        super(DataFrameTfidfVectorizer, self).fit(dataframe[self.col])
        return self

    def fit_transform(self, dataframe, y=None):
        dataframe = self.treat_special_chars(dataframe)
        field_matrix = super(DataFrameTfidfVectorizer, self).fit_transform(dataframe[self.col])
        features_names = map(lambda f: "_".join([self.col,f]), super(DataFrameTfidfVectorizer, self).get_feature_names())
        field_df = pd.DataFrame(field_matrix.A, columns=features_names)

        dataframe = pd.concat([dataframe, field_df], axis=1, join='inner')
        dataframe.drop([self.col], axis=1, inplace=True)

        return dataframe

    def transform(self, dataframe, copy=True):
        dataframe = self.treat_special_chars(dataframe)
        field_matrix = super(DataFrameTfidfVectorizer, self).transform(dataframe[self.col])
        features_names = map(lambda f: "_".join([self.col,f]), super(DataFrameTfidfVectorizer, self).get_feature_names())
        field_df = pd.DataFrame(field_matrix.A, columns=features_names)

        dataframe = pd.concat([dataframe, field_df], axis=1, join='inner')
        dataframe.drop([self.col], axis=1, inplace=True)

        return dataframe

## DataFrameTfidfVectorizer for textual fields

In [28]:
name_tfidf = DataFrameTfidfVectorizer(col="name", 
                                      ngram_range=(3, 5), 
                                      analyzer="char",
                                      binary=True, #False
                                      min_df = 50) #8

screen_name_tfidf = DataFrameTfidfVectorizer(col="screen_name", 
                                             ngram_range=(3, 5), 
                                             analyzer="char",
                                             binary=True, #False
                                             min_df = 50) #8

summary_tfidf = DataFrameTfidfVectorizer(col="summary",
                                         token_pattern=r'\w+',
                                         ngram_range=(1, 3), 
                                         analyzer="word",
                                         binary=True, #False
                                         sublinear_tf=True, 
                                         stop_words='english',
                                         min_df = 50) #5

In [29]:
name_tfidf.fit(train)
screen_name_tfidf.fit(train)
summary_tfidf.fit(train)

DataFrameTfidfVectorizer(analyzer='word', binary=True, col='summary',
             decode_error='strict', dtype=<type 'numpy.int64'>,
             encoding='utf-8', input='content', lowercase=True, max_df=1.0,
             max_features=None, min_df=50, ngram_range=(1, 4), norm=u'l2',
             preprocessor=None, smooth_idf=True, stop_words='english',
             strip_accents=None, sublinear_tf=False, token_pattern='\\w+',
             tokenizer=None, use_idf=True, vocabulary=None)

## Drop Columns Transformer

In [30]:
class DropColumnsTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        for c in self.cols:
            if c in X:
                X.drop([c], axis=1, inplace=True)
        return X

## Final Imputer and np array transformer

In [31]:
class NumpyArrayTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X.fillna(0, inplace=True)
        return np.asarray(X)

## Persist Enriched Data

In [None]:
# train.to_csv('data/manually_categorized/enriched-actor_classification_train.csv', index=False, encoding="utf-8")
# joblib.dump(train.columns, 'data/manually_categorized/actor_classification_random_forest_features_20151124.csv', compress=9)

# Feature selection

In [32]:
outcome = "manual_segment"

features = list(set(train.columns) - set([outcome]))

# Training the model

## KFold cross validation

In [33]:
# KFold cross validation setup
k_fold = KFold(n=len(train), n_folds=4, indices=False, shuffle=True)
b_scores, svc_scores = [], []

n_estimators = 10

for tr_indices, cv_indices in k_fold:
    tr   = train[tr_indices][features]
    cv   = train[cv_indices][features]

    tr_y = np.asarray(train[tr_indices][outcome])
    cv_y = np.asarray(train[cv_indices][outcome])
    
    # Model Pipeline
    model = Pipeline([("drop_cols", DropColumnsTransformer(["segment","link"])),
                      ("verified", VerifiedTransformer()),
                      ("lang", lang_ohe),
                      ("name_tfidf", name_tfidf),
                      ("screen_name_tfidf", screen_name_tfidf),
                      ("summary_tfidf", summary_tfidf),
                      ("nparray", NumpyArrayTransformer()),
                      ("scaler", StandardScaler()),
                      ("rf", RandomForestClassifier())])
    model.set_params(rf__n_estimators = n_estimators)

    model.fit(tr, tr_y)

    # Validate
    cv = model.transform(cv)
    print(confusion_matrix(cv_y, model.predict(cv)))    
    print('score:' + str(model.score(cv, cv_y)))
    

  stacklevel=1)


TypeError: list indices must be integers, not str

## Model Pipeline

In [34]:
model = Pipeline([("drop_cols", DropColumnsTransformer(["segment","link"])),
                  ("verified", VerifiedTransformer()),
                  ("lang", lang_ohe),
                  ("name_tfidf", name_tfidf),
                  ("screen_name_tfidf", screen_name_tfidf),
                  ("summary_tfidf", summary_tfidf),
                  ("nparray", NumpyArrayTransformer()),
                  ("scaler", StandardScaler()),
                  ("rf", RandomForestClassifier())])
model.set_params(rf__n_estimators = n_estimators)
model.fit(train[features], train[outcome])

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


TypeError: list indices must be integers, not str

## Persist model pipeline

In [None]:
model_path = "data/manually_categorized/actor_classification_random_forest_20151129.pkl"
joblib.dump(model, model_path, compress=9)

# Test

In [None]:
test_model           = joblib.load(model_path)

## Loading data

In [None]:
test_data = ['{"name":"Светлана Петухова","screen_name":"svpetuhova26623","summary":"","lang":"ru","favourites_count":23,"statuses_count":14,"friends_count":2,"followers_count":1,"listed_count":1,"verified":0}',
             '{"lang":"en","summary":"Artist, Writer, Designer. Tweets on tech, culture, art, animals, love the socioeconomy.","verified":0,"followers_count":175,"friends_count":397,"favourites_count":228,"statuses_count":410,"listed_count":12,"name":"Daniel Adornes","screen_name":"daniel_adornes"}',
             '{"lang":"en","summary":"Artist, Writer, Designer. Tweets on tech, culture, art, animals, love the socioeconomy.","verified":0,"followers_count":175,"friends_count":397,"favourites_count":228,"statuses_count":410,"listed_count":12,"name":"Daniel Adornes","screen_name":"daniel_adornes"}']
test_data = [json.loads(t) for t in test_data]
test_data = pd.DataFrame(test_data)

In [None]:
test_data

## Predict

In [None]:
result = test_model.predict_proba(test_data)

In [None]:
pd.DataFrame(result, columns=["business","person"])