In [2]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import preprocessing

from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, RidgeClassifier, Perceptron, PassiveAggressiveClassifier, LinearRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.base import BaseEstimator
from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords as sw
from nltk.tokenize import TweetTokenizer
import nltk
import re
import string
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

%matplotlib inline
# Use seaborn's plain white background for figures.
sns.set_style("white")

# Columns selected from the development records when the training set is loaded.
FEATURE_LIST = [
    'created_at',
    'id',
    'full_text',
    'user',
    'retweet_count',
    'favorite_count',
    'coordinates',
    'place',
    'class',
]

# NOTE(review): not referenced by the visible cells; purpose to be confirmed.
N_BINS = 18

# Directory prefixes; presumably for saved figures, result tables and
# submission files -- the visible cells do not reference them yet.
IMAGE_PATH = 'images/'
RESULTS_PATH = 'results/'
SUBMISSIONS_PATH = 'submissions/'

# NLTK's English stop-word list extended with additional short fragments
# (these look like tokenizer/lemmatizer leftovers of contractions and
# auxiliary verbs -- TODO confirm).
stopwords = sw.words('english') + [
    "'d",
    "'ll",
    "'re",
    "'s",
    "'ve",
    'doe',
    'ha',
    "n't",
    'sha',
    'wa',
    'wo',
]



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
class LemmaTokenizer(object):
    """Callable tokenizer that splits a document with ``TweetTokenizer`` and
    lemmatizes every token with ``WordNetLemmatizer``.

    Instances can be passed wherever a ``tokenizer`` callable taking a single
    string and returning a list of string tokens is expected.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.tokenizer = TweetTokenizer()

    def __call__(self, document):
        """Tokenize ``document`` and return the list of kept lemmas.

        :param document: str - the raw text to tokenize
        :return: list of str - lemmatized tokens, in document order, without
            empty tokens and without tokens rejected by the punctuation filter
        """
        lemmas = []

        for t in self.tokenizer.tokenize(document):
            # str.strip() returns a new string; the result must be kept,
            # otherwise surrounding whitespace survives into the lemma.
            t = t.strip()
            if not t:
                # nothing left after stripping: skip whitespace-only tokens
                continue

            lemma = self.lemmatizer.lemmatize(t)

            # remove punctuation tokens
            # NOTE(review): this is a substring test against
            # string.punctuation, so single marks (and runs that happen to be
            # contiguous in that constant) are dropped, while other
            # multi-character tokens such as "..." or emoticons are kept.
            if lemma not in string.punctuation:
                lemmas.append(lemma)

        return lemmas

class ClfSwitcher(BaseEstimator):
    """Estimator wrapper that delegates every call to ``self.estimator``.

    Because the wrapped classifier is an ordinary constructor parameter, it
    can be replaced through ``set_params`` / a parameter grid (for example
    ``clf__estimator`` and ``clf__estimator__alpha`` when the wrapper is the
    ``clf`` step of a pipeline).
    """

    def __init__(
        self,
        # NOTE: this default instance is created once, when the class is
        # defined, and is shared by every ClfSwitcher() built without an
        # explicit estimator.
        estimator = SGDClassifier(),
    ):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """

        self.estimator = estimator


    def fit(self, X, y=None, **kwargs):
        """
        Fit the wrapped classifier.
        :param X: training samples, passed through to ``estimator.fit``
        :param y: training targets, passed through to ``estimator.fit``
        :param kwargs: extra fit parameters (e.g. ``sample_weight``); they are
            forwarded to the wrapped estimator instead of being dropped
        :return: self
        """
        self.estimator.fit(X, y, **kwargs)
        return self


    def predict(self, X, y=None):
        """
        Predict with the wrapped classifier.
        :param X: samples to classify
        :param y: ignored; accepted only for signature compatibility
        :return: whatever ``estimator.predict(X)`` returns
        """
        return self.estimator.predict(X)


    def predict_proba(self, X):
        """
        Delegate to ``estimator.predict_proba``; fails with the wrapped
        estimator's own error if it does not provide that method.
        """
        return self.estimator.predict_proba(X)


    def score(self, X, y):
        """
        Delegate to ``estimator.score(X, y)``.
        """
        return self.estimator.score(X, y)

In [4]:
# Development data: one JSON record per line, narrowed to the FEATURE_LIST columns.
training_set = pd.read_json('development.jsonl', lines=True)[FEATURE_LIST]

# Evaluation data: same line-delimited format, all columns kept.
test_set = pd.read_json('evaluation.jsonl', lines=True)

In [28]:
from sklearn.preprocessing import KBinsDiscretizer

# Tweet text plus the author's profile description for every training row.
# A null description is replaced by '' so the string join below cannot fail.
tweets = training_set["full_text"].tolist()
desc_list = [item['description'] or '' for item in training_set["user"]]

test_tweets = test_set["full_text"].tolist()
test_desc_list = [item['description'] or '' for item in test_set["user"]]

# Character length of each tweet as an (n_samples, 1) column, the layout the
# discretizer is fitted on below.
train_len = np.array([len(t) for t in tweets]).reshape(-1, 1)
# Fixed: the evaluation lengths are measured on the evaluation tweets
# (they were previously computed from the training tweets).
test_len = np.array([len(t) for t in test_tweets]).reshape(-1, 1)

# Quantile bins are learned on the training lengths only and then applied
# unchanged to the evaluation lengths.
disc_len = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='quantile')

X_train_len = disc_len.fit_transform(train_len)
X_test_len = disc_len.transform(test_len)

# NOTE(review): np.array_str renders the WHOLE array as one display string
# (not one string per row), and neither variable is used by the visible
# cells -- confirm the intent before building on these.
X_train_len = np.array_str(X_train_len.astype(int))
X_test_len = np.array_str(X_test_len.astype(int))

# Model input: tweet text and user description joined by a single space.
X_train = [' '.join(z) for z in zip(tweets, desc_list)]
X_test = [' '.join(z) for z in zip(test_tweets, test_desc_list)]

In [None]:
# Two-step text pipeline: TF-IDF features built from lemmatized tweet tokens,
# followed by the switchable classifier wrapper.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(tokenizer=LemmaTokenizer(), binary=True)),
    ('clf', ClfSwitcher()),
])

# Single-point grid: a linear SVM-style SGD classifier (hinge loss) on
# 1- to 4-gram features.
sgd_grid = {
    'tfidf__min_df': [1],
    'tfidf__ngram_range': [(1, 4)],
    'clf__estimator': [SGDClassifier(random_state=42)],
    'clf__estimator__tol': [1e-3],
    'clf__estimator__alpha': [1e-5],
    'clf__estimator__loss': ['hinge'],
}
parameters = [sgd_grid]

# 3-fold cross-validated search using every available core.
gscv = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    return_train_score=False,
    verbose=5,
)
gscv.fit(X_train, training_set['class'])

gscv.cv_results_

In [95]:
# TODO function wrapper

# Searched parameters plus the mean cross-validated score, best score first.
report_columns = [
    'param_clf__estimator',
    'param_clf__estimator__alpha',
    'param_clf__estimator__tol',
    'param_clf__estimator__loss',
    'param_tfidf__min_df',
    'param_tfidf__ngram_range',
    'mean_test_score',
]
results = (
    pd.DataFrame(gscv.cv_results_)
    .loc[:, report_columns]
    .sort_values(by='mean_test_score', ascending=False)
)

results.to_excel("results_pipeline_SGDdesc#2.xlsx")

In [96]:

## TODO function wrapper
# Predict the evaluation set with the refitted best pipeline and write the
# submission file: an 'Id' index column followed by a 'Predicted' column.
predictions = gscv.predict(X_test)
pred = pd.DataFrame({"Predicted": predictions})

pred.to_csv('submission_SGDdesc#2.csv', sep=',', index_label='Id')



