In [None]:
import pandas as pd
import numpy as np
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import re, os, random
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from sklearn.metrics import confusion_matrix

%matplotlib inline

# Reproducibility: seed the NumPy and stdlib random number generators.
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so assigning it here does
# not change hashing in the already-running kernel; it is only inherited by
# child processes started afterwards.
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(37)
random.seed(17)

# Widen pandas' display limits so long messages and many columns stay visible.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_colwidth', 1000)

# Data directory: set the SPAM_DATA_ROOT environment variable to point at your
# own copy; the previous hard-coded location is kept as the fallback.
#root = '/Users/schwalmdaniel/github/kaggle/ml_training/session3'
root = os.environ.get('SPAM_DATA_ROOT', 'C:/Users/Administrator/kaggle/ml_training/session3')

# data explanation here: https://rstudio-pubs-static.s3.amazonaws.com/155304_cc51f448116744069664b35e7762999f.html

# Only the label column (v1) and the message text column (v2) are loaded.
train = pd.read_csv(os.path.join(root, 'spam.csv'), usecols=['v1', 'v2'])

# have a look at the ds
train.head(20)

In [None]:
# first let's see what is the shape of the data (rows, cols)

train.shape

In [None]:
# convert the target variable into numeric: 'spam' -> 1, everything else -> 0
# Matching the raw label 'spam' as well as the already-encoded value 1 keeps
# this cell idempotent: re-running it no longer resets every label to 0.

train['v1'] = train['v1'].isin(['spam', 1]).astype('int64')

In [None]:
# class balance of the target: number of rows per encoded label (1 = spam)
train['v1'].value_counts()

# 15% of the rows are spam
# NOTE(review): figure as stated by the author - confirm against the counts above

In [None]:
# Simplest numeric feature of a text: its length, ignoring leading and
# trailing whitespace.

train['v2_len'] = [len(message.strip()) for message in train['v2']]

In [None]:
# Normalise the message text to lowercase.

v2_lower = train['v2'].str.lower()
train['v2'] = v2_lower

In [None]:
# Number of ASCII digits (0-9) in each message.
# A vectorised regex count replaces the per-character Python loop; the class
# [0-9] (rather than \d) matches exactly the characters in string.digits.
train['no_of_num'] = train['v2'].str.count(r'[0-9]')
# TODO: a punctuation-count feature (e.g. number of '?' / '!') was sketched here but never finished.

In [None]:
import nltk

# Fetch NLTK's stop-word lists used when building the vectorizers.
nltk.download('stopwords')

# Message texts as a plain Python list for the vectorizers.
corpus = train['v2'].tolist()

# Preview goes last so that it is actually rendered as the cell output
# (previously the download's return value was shown instead).
corpus[:10]

In [None]:
# Upper bound on the vocabulary size kept by the vectorizers.
n_features = 450

# Union of NLTK's and scikit-learn's English stop-word lists (de-duplicated).
mystopwords = sorted(set(stopwords.words('english')) | set(stop_words.ENGLISH_STOP_WORDS))

# NOTE(review): the vocabulary is learned from the full data set, i.e. before
# any train/test split.
tfidf = TfidfVectorizer(max_features=n_features, stop_words=mystopwords)
tfidf_train = tfidf.fit_transform(corpus).toarray().astype(np.float16)

# max_features is only an upper bound, so name the columns after the matrix's
# real width instead of assuming exactly n_features columns exist.
tfidf_cols = ['v2_tfidf_' + str(i) for i in range(tfidf_train.shape[1])]
tfidf_df = pd.DataFrame(tfidf_train, columns=tfidf_cols, index=train.index)

# Attach all columns in one concat (hundreds of single-column inserts fragment
# the frame); dropping any previous copies first keeps the cell re-runnable.
train = pd.concat([train.drop(columns=tfidf_cols, errors='ignore'), tfidf_df], axis=1)

del tfidf_train, tfidf_df

In [None]:
# Average TF-IDF weight of every vocabulary term across the corpus.
count_vect = CountVectorizer(max_features=n_features, stop_words=mystopwords)
X_train_counts = count_vect.fit_transform(corpus)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
weights = np.asarray(X_train_tfidf.mean(axis=0)).ravel().tolist()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (available since 1.0); use whichever one exists.
_feature_names = getattr(count_vect, 'get_feature_names_out', None) or count_vect.get_feature_names
weights_df = pd.DataFrame({'term': list(_feature_names()), 'weight': weights})

# Highest-weighted terms first.
weights_df = weights_df.sort_values(by=['weight'], ascending=False)
weights_df.head(50)

In [None]:
# The textual feature weights all lie between 0 and 1, so the remaining
# numeric features are rescaled to the same range.
# NOTE(review): the scaler is fitted on the full data set, before the split.

numeric_cols = ['v2_len', 'no_of_num']
scaler = MinMaxScaler()
train[numeric_cols] = scaler.fit_transform(train[numeric_cols])

In [None]:
# false negative (FN) / false positive (FP) values

In [None]:
# inspect the first rows again, now including the engineered feature columns
train.head(10)

In [None]:
# Label and text columns only; used to look up message texts by row label.
df = train.loc[:, ['v1', 'v2']]
df.head()

In [None]:
# Target: the encoded label. Features: every column except label and raw text.
y = train['v1']
X = train.drop(columns=['v1', 'v2'])

In [None]:
# Hold out 10% of the rows. The row labels of df are split alongside X and y
# so that test predictions can be traced back to the original messages.
split_parts = train_test_split(
    X, y, df.index,
    test_size=0.10,
    random_state=42,
    shuffle=True,
)
X_train, X_test, y_train, y_test, indices_train, indices_test = split_parts

split_shapes = (X_train.shape, X_test.shape)
print('Training shape: %s, test shape: %s' % split_shapes)

In [None]:
# Train a random forest (fixed seed, otherwise default settings) and predict
# the held-out rows. fit() returns the estimator itself, so it can be chained.
forest = RandomForestClassifier(random_state=42).fit(X_train, y_train)
predictions = forest.predict(X_test)

In [None]:
# mean accuracy of the fitted forest on the held-out test set
forest.score(X_test, y_test)

In [None]:
# Rows are the true classes, columns the predicted ones. The label order is
# pinned to [0, 1] so the tick labels below always line up with the matrix.
class_names = ['not spam', 'spam']
conf_mat = confusion_matrix(y_test, predictions, labels=[0, 1])

fig, ax = plt.subplots()
sns.heatmap(conf_mat, annot=True, fmt=".0f",
            xticklabels=class_names, yticklabels=class_names, ax=ax)
# Without axis labels the reader cannot tell the true axis from the predicted one.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion matrix on the test set')
plt.show()

In [None]:
# False positives: we predicted spam (1) but the message is not spam (0).
false_positive = (y_test.values == 0) & (predictions == 1)
mis = df.loc[indices_test[false_positive], ['v2']]
mis.head()

In [None]:
# False negatives: we predicted not spam (0) but the message is spam (1).
false_negative = (y_test.values == 1) & (predictions == 0)
mis = df.loc[indices_test[false_negative], ['v2']]
mis.head()

## Information extraction examples

### Part of Speech tagging

In [None]:
import nltk
from nltk import ChunkParserI
from nltk.chunk.util import tree2conlltags, conlltags2tree
from nltk.tag import UnigramTagger, BigramTagger
from nltk.corpus import conll2000
from nltk.tokenize import sent_tokenize, word_tokenize

#nltk.download('conll2000')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('punkt')

def backoff_tagger(train_sents, tagger_classes, backoff=None):
    """Build a chain of taggers in which each one falls back on the previous.

    Every class in tagger_classes is instantiated with train_sents and the
    tagger built so far as its backoff. The last tagger created is returned;
    with no classes given, the initial backoff is returned unchanged.
    """
    chain = backoff
    for tagger_cls in tagger_classes:
        chain = tagger_cls(train_sents, backoff=chain)

    return chain

def conll_tag_chunks(chunk_sents):
    """Reduce chunk trees to sentences of (POS tag, chunk tag) pairs.

    Each tree is flattened with tree2conlltags into (word, tag, chunk)
    triples; the word is dropped and only (tag, chunk) is kept.
    """
    tag_chunk_sents = []
    for tree in chunk_sents:
        triples = tree2conlltags(tree)
        tag_chunk_sents.append([(pos, chunk) for (_word, pos, chunk) in triples])

    return tag_chunk_sents

class TagChunker(ChunkParserI):
    """Chunker that predicts chunk tags from part-of-speech tags.

    A backoff chain of taggers is trained on (POS tag, chunk tag) pairs taken
    from the training trees; the words themselves are not used for training.
    """

    def __init__(self, train_chunks, tagger_classes=(UnigramTagger, BigramTagger)):
        """Train the tagger chain.

        train_chunks: chunked sentences (trees) to learn from.
        tagger_classes: tagger classes in backoff order; each later class
            falls back on the one before it. The default is a tuple so that
            no mutable object is shared between calls.
        """
        train_sents = conll_tag_chunks(train_chunks)
        self.tagger = backoff_tagger(train_sents, tagger_classes)

    def parse(self, tagged_sent):
        """Return the chunk tree for a sequence of (word, POS tag) pairs.

        Returns None when tagged_sent is empty.
        """
        if not tagged_sent:
            return None

        words, tags = zip(*tagged_sent)
        # "Tagging" the POS tags yields one (POS tag, chunk tag) pair per word.
        chunks = self.tagger.tag(tags)

        return conlltags2tree([(w, t, c) for (w, (t, c)) in zip(words, chunks)])

# first we have to pre-train our model from NLTK factory training set

# Make sure the CoNLL-2000 corpus is installed: on a fresh environment the
# chunked_sents() call below would otherwise fail with LookupError.
try:
    nltk.data.find('corpora/conll2000')
except LookupError:
    nltk.download('conll2000')

conll_train = conll2000.chunked_sents('train.txt')
chunker = TagChunker(conll_train)

In [None]:
# With the chunker trained, POS tag a sentence and chunk it.
# Description of all POS tags:
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

inputString = 'That is one small step for man, one giant leap for mankind'

# Tokenise first, tag the tokens, then let the chunker build the tree that is
# displayed as the cell output.
tokens = word_tokenize(inputString)
pos_tagged = nltk.pos_tag(tokens)
chunker.parse(pos_tagged)

### Named Entity Recognition

In [None]:
from pprint import pprint
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the small English pipeline imported above.
nlp = en_core_web_sm.load()

# install the model once with: python -m spacy download en_core_web_sm

sentence = 'European Union fined Google for a record of $5.1bn on Monday'

# Highlight the named entities found in the sentence.
doc = nlp(sentence)
displacy.render(doc, jupyter=True, style='ent')

### Dependency parsing

In [None]:
# render the same sentence again, this time with displacy's 'dep' style
displacy.render(nlp(sentence), jupyter=True, style='dep')

### Sentiment Analysis

In [None]:
from textblob import TextBlob

# a positively worded statement; the blob's sentiment is shown as the cell output
testimonial = TextBlob("The teacher is beautiful!")
testimonial.sentiment

In [None]:
# the same sentence pattern with a negative, intensified adjective, for comparison
testimonial = TextBlob("The teacher is very ugly!")
testimonial.sentiment

In [None]:
# a two-sentence input: an opinionated sentence followed by a statement about a number
testimonial = TextBlob("That is crap. Some think the population of Hungary is 10 million")
testimonial.sentiment