In [2]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import preprocessing

from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, RidgeClassifier, Perceptron, PassiveAggressiveClassifier, LinearRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.base import BaseEstimator
from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords as sw
from nltk.tokenize import TweetTokenizer
import nltk
import re
import string
# Fetch the NLTK resources requested by this notebook, in a fixed order.
for _nltk_resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(_nltk_resource)

%matplotlib inline
sns.set_style("white")

# Columns selected from the development set right after it is loaded.
FEATURE_LIST = [
    'created_at',
    'id',
    'full_text',
    'user',
    'retweet_count',
    'favorite_count',
    'coordinates',
    'place',
    'class',
]
# NOTE(review): N_BINS and IMAGE_PATH are not referenced in the cells visible
# here; presumably they configure plots elsewhere in the notebook - confirm.
N_BINS = 18
IMAGE_PATH = 'images/'

# NLTK's English stop words followed by extra fragments. The extras look like
# tokenized/lemmatized leftovers of contractions and stop words - TODO confirm.
stopwords = [
    *sw.words('english'),
    "'d", "'ll", "'re", "'s", "'ve", 'doe', 'ha', "n't", 'sha', 'wa', 'wo',
]



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [39]:
from nltk.stem.porter import PorterStemmer
class LemmaTokenizer(object):
    """Callable tokenizer: split a document into tokens, then lemmatize each one.

    Instances are callable (``tokenizer(document) -> list of str``); in this
    notebook one is passed as the ``tokenizer=`` argument of ``TfidfVectorizer``.

    Tokens are produced by ``TweetTokenizer`` and reduced with
    ``WordNetLemmatizer``. Digit and emoji tokens are NOT filtered.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.tokenizer = TweetTokenizer()

    def __call__(self, document):
        """Return the list of lemmas for ``document``.

        :param document: str - raw text to tokenize
        :return: list of str - lemmas, in order of appearance
        """
        lemmas = []

        for token in self.tokenizer.tokenize(document):
            # str.strip() returns a new string; keep the result so surrounding
            # whitespace never ends up inside a feature.
            token = token.strip()
            lemma = self.lemmatizer.lemmatize(token)

            # Substring test against string.punctuation: drops the empty string
            # and single punctuation characters (plus the few runs that appear
            # contiguously in that constant, e.g. "()"). Runs such as "..." and
            # emoticons such as ":)" are not substrings, so they are kept.
            if lemma not in string.punctuation:
                lemmas.append(lemma)

        return lemmas

class ClfSwitcher(BaseEstimator):
    """Wrapper estimator whose wrapped classifier is itself a parameter.

    Because ``estimator`` is a constructor parameter, a parameter grid can swap
    the classifier (``clf__estimator``) and tune its settings
    (``clf__estimator__<name>``) when this class is used as a pipeline step.

    Every method simply delegates to the wrapped estimator.
    """

    def __init__(
        self,
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier

        Note: the default ``SGDClassifier()`` is created once, when the class is
        defined, so instances constructed without an argument share that object.
        Pass an explicit estimator when independent instances are needed.
        """

        self.estimator = estimator


    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped estimator and return ``self``.

        :param X: training samples
        :param y: training targets
        :param kwargs: extra fit parameters (e.g. ``sample_weight``), forwarded
            unchanged to the wrapped estimator's ``fit``
        """
        # Forward fit parameters instead of silently discarding them.
        self.estimator.fit(X, y, **kwargs)
        return self


    def predict(self, X, y=None):
        """Return the wrapped estimator's predictions for ``X`` (``y`` is ignored)."""
        return self.estimator.predict(X)


    def predict_proba(self, X):
        """Return the wrapped estimator's ``predict_proba(X)``."""
        return self.estimator.predict_proba(X)


    def score(self, X, y):
        """Return the wrapped estimator's ``score(X, y)``."""
        return self.estimator.score(X, y)

In [17]:
# Labelled development data (JSON Lines), restricted to the FEATURE_LIST columns.
training_set = pd.read_json('development.jsonl', lines=True)[FEATURE_LIST]

# Evaluation data (JSON Lines), kept with all of its columns.
test_set = pd.read_json('evaluation.jsonl', lines=True)

In [81]:
def _user_descriptions(users):
    """Return each user's 'description', substituting '' when it is null/empty."""
    # A null description would make the ' '.join below raise TypeError.
    return [item['description'] or '' for item in users]


def _join_text_and_description(texts, descriptions):
    """Pair each text with its description and join the two with a single space."""
    return [' '.join(pair) for pair in zip(texts, descriptions)]


tweets = training_set["full_text"].tolist()
desc_list = _user_descriptions(training_set["user"])

test_tweets = test_set["full_text"].tolist()
test_desc_list = _user_descriptions(test_set["user"])

# Model input: tweet text followed by the author's profile description.
X_train = _join_text_and_description(tweets, desc_list)
X_test = _join_text_and_description(test_tweets, test_desc_list)

In [94]:

# Two-step pipeline: tf-idf features from LemmaTokenizer output, then a
# ClfSwitcher so the classifier itself can be chosen by the grid.
pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(binary=True, tokenizer=LemmaTokenizer())),
    ("clf", ClfSwitcher()),
])

# A list of grids; each entry pins one classifier and the settings tried for it.
parameters = [
    dict(
        tfidf__min_df=[1],
        tfidf__ngram_range=[(1, 4)],
        clf__estimator=[SGDClassifier(random_state=42)],
        clf__estimator__tol=[1e-3],
        clf__estimator__alpha=[1e-5],
        clf__estimator__loss=['hinge'],
    ),
]

# 3-fold cross-validated search over the grid, using all available cores.
gscv = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=5,
    return_train_score=False,
)
gscv.fit(X_train, training_set["class"])

gscv.cv_results_

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed: 18.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed: 18.3min finished


{'mean_fit_time': array([ 82.05691417, 101.62800749, 109.83191037,  71.70315385,
         83.54212658,  96.63146345,  71.49457192,  81.90162865,
         97.45257179,  72.1416827 ,  85.6300536 , 105.78156662,
         77.29465564,  79.87803253,  95.91624149,  70.90332437,
         85.56139779,  95.33551391,  70.57609447,  87.73535514,
        107.53691037,  76.10936252,  87.95251163,  86.70856158]),
 'std_fit_time': array([ 0.10421577,  2.30089154, 11.07154024,  0.60771495,  0.85514258,
         1.69544298,  0.65355104,  1.12156968,  1.01598725,  0.47904903,
         3.82972276,  2.06934859,  3.20967891,  0.22906952,  0.91455907,
         0.35083433,  0.76669143,  1.3533195 ,  1.9402948 ,  0.29840547,
         3.84852318,  1.07212048,  2.04699308,  4.78193652]),
 'mean_score_time': array([33.71030664, 32.69957749, 33.95396765, 27.75951298, 30.6884199 ,
        33.68359462, 28.17405017, 32.56996695, 32.39769642, 28.1308376 ,
        34.6991214 , 32.4441061 , 28.39764826, 31.5356613 , 35

In [95]:
# TODO function wrapper

# Keep only the tuned parameters and the mean CV score, best candidate first.
results = (
    pd.DataFrame(gscv.cv_results_)[[
        'param_clf__estimator',
        'param_clf__estimator__alpha',
        'param_clf__estimator__tol',
        'param_clf__estimator__loss',
        'param_tfidf__min_df',
        'param_tfidf__ngram_range',
        'mean_test_score',
    ]]
    .sort_values(by=['mean_test_score'], ascending=False)
)

results.to_excel("results_pipeline_SGDdesc#2.xlsx")

In [96]:

## TODO function wrapper
# Predict with the refitted best pipeline and write one "Predicted" column,
# with the row position exported as the "Id" column.
predictions = gscv.predict(X_test)
pred = pd.DataFrame({"Predicted": predictions})

pred.to_csv('submission_SGDdesc#2.csv', sep=',', index_label='Id')



