In [19]:
import json
import pandas as pd
import numpy as np
from hashlib import sha256
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
import spacy
from spacy.matcher import Matcher
from nltk.corpus import stopwords

In [115]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a column from the dataframe to perform additional transformations on
    """ 
    def __init__(self, key):
        self.key = key
        
    def fit(self, X, y=None):
        return self
    

class TextSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def transform(self, X):
        return X[self.key]
    
    
class NumberSelector(Selector):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def transform(self, X):
        print(self.key)
        return X[[self.key]]
    
    

In [89]:
df = pd.read_csv('BLU09 - Information Extraction/datasets/train_tweets.csv', encoding='latin1')
df.drop(["id"], axis=1,inplace=True)
df=df.dropna()
#df.rename(columns={"v1":"label", "v2":"message"},inplace=True)
df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [3]:
# load the medium-sized SpaCy model
nlp = spacy.load('en_core_web_md')

In [5]:
# Create a list of SpaCy "Docs" by leveraging the SpaCy pipeline
docs = list(nlp.pipe(df.tweet))

In [6]:
df.label.value_counts(normalize=True)

0    0.929854
1    0.070146
Name: label, dtype: float64

In [9]:
map(str, train_data['tweet'].values)

<map at 0x7f5669a00940>

In [42]:
# Split in train and validation
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [43]:
# Build the pipeline
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                   ('classifier', RandomForestClassifier(random_state = 42))])

# Train the classifier
text_clf.fit(map(str, train_data['tweet'].values), train_data['label'].values)

predicted = text_clf.predict(map(str, test_data['tweet'].values))
np.mean(predicted == test_data['label'])



0.9524479899890506

### Adding extra features

In [20]:
stop_words = set(stopwords.words('english'))


In [111]:
df['length'] = df['tweet'].map(len)
df['words'] = df['tweet'].str.split().map(len)
df['commas'] = df['tweet'].str.count(',')
df['upper'] = df['tweet'].map(lambda x: map(str.isupper, x)).map(sum)
df['capitalized'] = df['tweet'].map(lambda x: map(str.istitle, x)).map(sum)

df['words_not_stopword'] = df['tweet'].apply(lambda x: len([t for t in x.split() if t not in stop_words]))

df['avg_word_length'] = df['tweet'].apply(lambda x: np.mean([len(t) for t in x.split() if t not in stop_words]) if len([len(t) for t in x.split(' ') if t not in stop_words]) > 0 else 0)

df = df.dropna()

In [112]:
#ax_list = df.hist(column='length', by='label', bins=50,figsize=(12,4))
#ax_list[0].set_xlim((0,300))
#ax_list[1].set_xlim((0,300))

In [121]:
text = Pipeline([
                ('selector', TextSelector("tweet")),
                ('tfidf', TfidfVectorizer())
            ])

length =  Pipeline([
                ('selector', NumberSelector("length")),
                ('standard', StandardScaler())
            ])
words =  Pipeline([
                ('selector', NumberSelector(key='words')),
                ('standard', StandardScaler())
            ])
words_not_stopword =  Pipeline([
                ('selector', NumberSelector(key='words_not_stopword')),
                ('standard', StandardScaler())
            ])
avg_word_length =  Pipeline([
                ('selector', NumberSelector(key='avg_word_length')),
                ('standard', StandardScaler())
            ])
commas =  Pipeline([
                ('selector', NumberSelector(key='commas')),
                ('standard', StandardScaler())
            ])
upper =  Pipeline([
                ('selector', NumberSelector(key='upper')),
                ('standard', StandardScaler())
            ])
capitalized =  Pipeline([
                ('selector', NumberSelector(key='capitalized')),
                ('standard', StandardScaler())
            ])

feats = FeatureUnion([('text', text), 
                     ('length', length),
                     ('words', words),
                     ('words_not_stopword', words_not_stopword),
                      ('avg_word_length', avg_word_length),
                      ('commas', commas),
                    ('upper', upper),
                     ('capitalized', capitalized)])

feature_processing = Pipeline([('feats', feats)])

In [122]:
# Split in train and validation
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

pipeline = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier())
])
pipeline.fit(X = train_data, y = train_data.label)

preds = pipeline.predict(test_data)
np.mean(preds == test_data.label)


length
words
words_not_stopword
avg_word_length
commas
upper
capitalized




length
words
words_not_stopword
avg_word_length
commas
upper
capitalized


0.9535429375879868

In [81]:
train_data

Unnamed: 0,label,tweet,length,words,commas,upper,capitalized,words_not_stopword,avg_word_length
12110,1,@user @user @user because i've been paying att...,94,16,0,3,3,11,5.454545
14081,0,@user raft building at salford quays as pa of...,89,16,0,0,0,12,5.166667
1829,0,friday Ã°ÂÂÂÃ°ÂÂÂ» #gdegblog #friday #sel...,112,12,0,11,11,12,8.166667
2769,0,#fashion it is a true #fact,29,6,0,0,0,3,5.666667
31818,0,"so to #share the #simple, #elegant #business...",91,13,1,0,0,8,8.125000
9250,0,@user big update on @user colne uk european b...,106,17,0,0,0,14,5.714286
29624,0,five minutes to google translate and print a s...,87,15,0,0,0,10,5.800000
5505,0,lady_graces: berniesanders according to #gucc...,117,15,0,0,0,13,7.307692
1809,0,the simple things in life make one happy #smil...,82,12,0,0,0,10,6.200000
23277,0,#whoolo in film you can have sad endings. #an...,58,10,0,0,0,6,5.166667
