In [None]:
import pandas as pd
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import re, os, random
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words

%matplotlib inline

# reproducible results
# NOTE: PYTHONHASHSEED is read when an interpreter starts, so assigning it here
# cannot change hash randomization of the already-running kernel; it is only
# inherited by processes launched afterwards.
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(37)
random.seed(17)

pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',150)

# Directory that holds spam.csv. Set the ML_TRAINING_ROOT environment variable
# to point at your own copy (e.g. 'e:/kaggle/ml_training/session3') instead of
# editing this cell; the fallback is the original author's local checkout.
root = os.environ.get('ML_TRAINING_ROOT',
                      '/Users/schwalmdaniel/github/kaggle/ml_training/session3')

# data explanation here: https://rstudio-pubs-static.s3.amazonaws.com/155304_cc51f448116744069664b35e7762999f.html

# Only two columns are loaded: v1 (the ham/spam label) and v2 (the message text).
# NOTE(review): if this raises UnicodeDecodeError the file is probably not UTF-8;
# confirm the file's encoding before adding an explicit encoding= argument.
train=pd.read_csv(os.path.join(root, "spam.csv"), usecols=['v1','v2'])

# have a look at the ds
train.head()

In [None]:
# first let's see what is the shape of the data: DataFrame.shape is (rows, cols)

train.shape

In [None]:
# convert the target variable into numeric: 1 for 'spam', 0 for anything else.
# The dtype guard makes the cell safe to re-run: once v1 is numeric, comparing
# it with 'spam' again would silently reset every label to 0.

if not pd.api.types.is_numeric_dtype(train['v1']):
    train['v1'] = (train['v1'] == 'spam').astype('int64')

In [None]:
# number of rows per distinct value of the target column
train['v1'].value_counts()

# 15% of the rows are spam

In [None]:
# plain Python list of the message texts, used below to fit the vectorizers
corpus = train['v2'].tolist()
# peek at the first ten messages
corpus[:10]

In [None]:
# Upper bound on the vocabulary size; the fitted vocabulary can be smaller
# (min_df=2 discards terms that occur in fewer than two messages).
n_features = 400
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split performed further down, so the held-out rows influence the vocabulary
# and IDF weights. Acceptable for a demo, but it is a form of data leakage.
tfidf = TfidfVectorizer(max_features=n_features, min_df=2, analyzer='word',strip_accents='unicode')
tfidf.fit(corpus)

# Dense float16 matrix, one row per message and one column per vocabulary term.
tfidf_train = tfidf.transform(train['v2']).toarray().astype(np.float16)

# Name the columns after the matrix's real width rather than n_features, so a
# vocabulary with fewer than n_features terms does not raise an IndexError.
tfidf_cols = ['v2_tfidf_' + str(i) for i in range(tfidf_train.shape[1])]
tfidf_df = pd.DataFrame(tfidf_train, index=train.index, columns=tfidf_cols)

# Attach all feature columns in a single concat (avoids hundreds of one-column
# inserts). Columns left over from a previous run of this cell are dropped
# first so that re-running does not duplicate them.
stale_cols = [c for c in train.columns if str(c).startswith('v2_tfidf_')]
train = pd.concat([train.drop(stale_cols, axis=1), tfidf_df], axis=1)

del tfidf_train, tfidf_df

In [None]:
# Rank the vocabulary terms by their mean TF-IDF weight over all messages.
count_vect = CountVectorizer(max_features=n_features, min_df=2,
                            analyzer='word',strip_accents='unicode')
X_train_counts = count_vect.fit_transform(corpus)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# column-wise mean of the sparse matrix, flattened to one weight per term
weights = np.asarray(X_train_tfidf.mean(axis=0)).ravel().tolist()
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(count_vect, 'get_feature_names_out'):
    terms = count_vect.get_feature_names_out()
else:
    terms = count_vect.get_feature_names()
weights_df = pd.DataFrame({'term': list(terms), 'weight': weights})
weights_df = weights_df.sort_values(ascending=False, by=['weight'])
# the 50 highest-weighted terms
weights_df.head(50)

In [None]:
# first ten rows of the frame, now including the TF-IDF feature columns
train.head(10)

In [None]:
# features: every column except the label (v1) and the raw message text (v2)
X = train.drop(['v1','v2'], axis=1)
# target: the numeric spam label
y = train['v1']

In [None]:
# hold out 10% of the rows for testing; shuffled split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42, shuffle=True)
print ('Training shape: %s, test shape: %s' % (X_train.shape, X_test.shape))

In [None]:
# baseline model: random forest with library-default hyper-parameters and a fixed seed
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)
# predicted labels for the held-out rows
predictions = forest.predict(X_test)

In [None]:
# mean accuracy of the baseline forest on the held-out rows
forest.score(X_test, y_test)

## Information extraction examples

### Part of Speech tagging

In [None]:
import nltk
from nltk import ChunkParserI
from nltk.chunk.util import tree2conlltags, conlltags2tree
from nltk.tag import UnigramTagger, BigramTagger
from nltk.corpus import conll2000
from nltk.tokenize import sent_tokenize, word_tokenize

#nltk.download('conll2000')
#nltk.download('averaged_perceptron_tagger')

def backoff_tagger(train_sents, tagger_classes, backoff=None):
    """Build a chain of taggers in which each one falls back to the previous.

    Every class in ``tagger_classes`` is instantiated in order as
    ``cls(train_sents, backoff=<tagger built so far>)``.

    :param train_sents: training sentences handed to every tagger class.
    :param tagger_classes: iterable of tagger classes, innermost fallback first.
    :param backoff: optional initial fallback for the first class.
    :return: the last tagger built, or ``backoff`` unchanged when
        ``tagger_classes`` is empty.
    """
    chain = backoff
    for tagger_cls in tagger_classes:
        chain = tagger_cls(train_sents, backoff=chain)

    return chain

def conll_tag_chunks(chunk_sents):
    """Convert chunk trees into per-sentence lists of (tag, chunk) pairs.

    Each tree is flattened with ``tree2conlltags`` into (word, tag, chunk)
    triples; the word is then dropped from every triple.

    :param chunk_sents: iterable of chunk trees.
    :return: list with one list of ``(tag, chunk)`` tuples per input tree.
    """
    return [
        [(pos, iob) for (_word, pos, iob) in tree2conlltags(tree)]
        for tree in chunk_sents
    ]

class TagChunker(ChunkParserI):
    """Chunk parser that predicts a chunk tag for each part-of-speech tag.

    A backoff chain of taggers is trained on (tag, chunk) pairs taken from
    chunked sentences, then applied to the tag sequence of new sentences.
    """

    def __init__(self, train_chunks, tagger_classes=[UnigramTagger, BigramTagger]):
        """Train the backoff tagger chain.

        :param train_chunks: chunk trees used as training data.
        :param tagger_classes: tagger classes to chain, first one innermost.
        """
        pos_iob_sents = conll_tag_chunks(train_chunks)
        self.tagger = backoff_tagger(pos_iob_sents, tagger_classes)

    def parse(self, tagged_sent):
        """Chunk one tagged sentence.

        :param tagged_sent: sequence of ``(word, tag)`` pairs.
        :return: the tree built by ``conlltags2tree``, or ``None`` when
            ``tagged_sent`` is empty.
        """
        if not tagged_sent:
            return None

        words, pos_tags = zip(*tagged_sent)
        # the tagger was trained on tag sequences, so it is fed the tags
        # (not the words) and yields one (tag, chunk) pair per token
        tag_chunk_pairs = self.tagger.tag(pos_tags)

        conll_triples = [
            (word, pos, iob)
            for word, (pos, iob) in zip(words, tag_chunk_pairs)
        ]
        return conlltags2tree(conll_triples)

# First we have to train our chunker on the chunked sentences of the CoNLL-2000
# corpus that ships with NLTK (see the commented nltk.download('conll2000') above).

conll_train = conll2000.chunked_sents('train.txt')
chunker = TagChunker(conll_train)

In [None]:
# then we can POS tag and visualize our sentence
# you can find the description of all POS tags here: 
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

inputString = 'That is one small step for man, one giant leap for mankind'

# tokenize the sentence, tag every token with its part of speech,
# then chunk the tagged tokens with the chunker trained above
pos_tagged = nltk.pos_tag(word_tokenize(inputString))
chunker.parse(pos_tagged)

### Named Entity Recognition

In [None]:
from pprint import pprint
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# load the spaCy pipeline from the en_core_web_sm package imported above
nlp = en_core_web_sm.load()

# model install command used by the author: python -m spacy download en
# NOTE(review): newer spaCy releases expect the full package name
# (python -m spacy download en_core_web_sm) - confirm for your version.

sentence = '''European authorities fined Google a record $5.1 billion on Wednesday for abusing 
its power in the mobile phone market and ordered the company to alter its practices'''

# run the pipeline on the sentence and render the recognised entities inline
displacy.render(nlp(sentence), style='ent', jupyter=True)

### Dependency parsing

In [None]:
# render the dependency parse of a short sentence inline
displacy.render(nlp('I want an early upgrade'), jupyter=True, style='dep')

### Sentiment Analysis

In [None]:
from textblob import TextBlob

# sentiment of a sentence with a positive adjective
testimonial = TextBlob("The teacher is beautiful!")
testimonial.sentiment

In [None]:
# same sentence with a negative adjective, for comparison
testimonial = TextBlob("The teacher is ugly!")
testimonial.sentiment

In [None]:
# a plain factual statement, for comparison with the two opinions above
testimonial = TextBlob("The population of Hungary is 10 million")
testimonial.sentiment

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

# Search space for the randomized hyper-parameter search of the random forest.

# Number of trees in random forest: 200, 400, ..., 2000
n_estimators = list(range(200, 2001, 200))
# Number of features to consider at every split.
# 'auto' is intentionally not listed: for classifiers it was an alias of
# 'sqrt' (so it only duplicated that option), and newer scikit-learn releases
# deprecated and then removed it.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree: 10, 20, ..., 110, plus None (no depth limit)
max_depth = list(range(10, 111, 10))
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune. The fixed random_state (same value as
# the baseline forest) removes forest-level randomness from the comparison,
# so the search result is repeatable between runs.
rf = RandomForestClassifier(random_state=42)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

# best parameter combination found by the search
rf_random.best_params_

In [None]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [1, 5, 10],
    # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is rejected
    # by newer scikit-learn releases.
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 3],
    # Was [1.0]. A float here is a fraction of the training samples, so 1.0
    # demanded 100% of the samples in a node before it could be split, which
    # yields degenerate trees. 2 is the smallest valid integer value.
    'min_samples_split': [2],
    'n_estimators': [100,500, 900]
}
# Create a base model; seeded so the grid search is repeatable between runs
rf = RandomForestClassifier(random_state=42)
# Instantiate the grid search model: exhaustive search, 3-fold CV, all cores
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# keep the estimator that the grid search selected as best
best_grid = grid_search.best_estimator_
