In [1]:
import pandas as pd
import re
import nltk
from collections import defaultdict # Dictionaries with default values
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
## importing data sets and dropping nan's
df = pd.read_csv('GFM_data.csv',sep = '\t')
df = df.loc[df['Text'].dropna().index]
df.head()

Unnamed: 0.1,Unnamed: 0,Url,Category,Position,Title,Location,Amount_Raised,Goal,Number_of_Donators,Length_of_Fundraising,FB_Shares,GFM_hearts,Text,Latitude,Longitude
0,0,https://www.gofundme.com/3ctqm-medical-bills-f...,Medical,0,92 Yr old Man Brutally Attacked.,"LOS ANGELES, CA",327345.0,15000,12167,1 month,26k,12k,Rodolfo Rodriguez needs your help today! 92 Yr...,34.052234,-118.243685
1,1,https://www.gofundme.com/olivia-stoy-bone-marr...,Medical,0,Olivia Stoy:Transplant & Liv it up!,"ASHLEY, IN",316261.0,1.0M,5598,3 months,12k,5.7k,Thomas Stoy needs your help today! Olivia Stoy...,41.527272,-85.065523
2,2,https://www.gofundme.com/autologous-Tcell-Tran...,Medical,1,AUTOLOGOUS T CELL TRANSPLANT,"STATEN ISLAND, NY",241125.0,250000,841,2 months,1.8k,836,Philip Defonte needs your help today! AUTOLOGO...,40.579532,-74.150201
3,3,https://www.gofundme.com/a-chance-of-rebirth,Medical,1,A chance of rebirth,"DUBLIN, CA",237424.0,225000,4708,1 month,9.7k,4.7k,Sriram Kanniah needs your help today! A chance...,37.702152,-121.935792
4,4,https://www.gofundme.com/teamclaire,Medical,1,Claire Wineland Needs Our Help,"GARDEN GROVE, CA",236590.0,225000,8393,2 months,6.4k,8.9k,Melissa Yeager needs your help today! Claire W...,33.774269,-117.937995


## Text preprocessing using Regex

In [3]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
REPLACE_IP_ADDRESS = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')

def extract_entities(text):
    names = []
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                names.append(' '.join(c[0] for c in chunk.leaves()))
    new_text = text
    for name in names:
        if name in text:
            new_text = new_text.replace(name, 'NLP')
    return new_text

def clean_text(x):
    ## removing names
    x = extract_entities(x)
    ## normalizing text by stripping white space and lower casing
    x =  x.lower().strip()
    ## removing urls
    x = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', x)
    ## removing phone numbers
    x = re.sub('\([0-9]{3}\)\s*[0-9]{3}-[0-9]{4}','',x)
    ## strip all non alphanumeric things
    x = re.sub('\n',' ',x)
    x = re.sub("[^a-zA-Z0-9 #]",'',x)
    x = re.sub("\s+",' ',x)
    text = x.replace('\n', ' ').lower()# lowercase text
    text = REPLACE_IP_ADDRESS.sub('', text) # remove ip address
    text = REPLACE_BY_SPACE_RE.sub(' ',text)# replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('',text)# delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join([w for w in text.split() if not w in STOPWORDS])# delete stopwords from text
    
    return text

In [4]:
text = df['Text'][4]
new_text = clean_text(text)
df['Text'] = df['Text'].apply(clean_text)
print(text)
print()
print(new_text)

Melissa Yeager needs your help today! Claire Wineland Needs Our Help - First of all, WOW!! We have been blown away by the amazing outpouring of love and support you have shown to Claire..deeply humbled in fact.Â  Friends have encouraged us to raise the goal to meet the demand but I want everyone to know that our transplant finances have been covered in full.Â  Anythin...

nlp nlp needs help today nlp nlp needs nlp first nlp blown away amazing outpouring love support shown nlpdeeply humbled fact friends encouraged us raise goal meet demand want everyone know transplant finances covered full nlp


## Splitting into train and test set

In [5]:
categories = {i: idx for idx,i in enumerate(df['Category'].unique())}

In [6]:
from sklearn.model_selection import train_test_split
X = df['Text']
y = [categories[i] for i in df['Category']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## BoW (Bag of Words)


In [7]:
from scipy import sparse as sp_sparse

In [8]:
# Dictionary of all words from train corpus with their counts.
words_counts = {}
for comments in X_train:
    for word in comments.split():
        if word not in words_counts:
            words_counts[word] = 0
        words_counts[word] += 1
##most pop words        
DICT_SIZE = 10000
POPULAR_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
## same dics but flopped
WORDS_TO_INDEX = {key: rank for rank, key in enumerate(POPULAR_WORDS, 0)}
INDEX_TO_WORDS = {index:word for word, index in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()

In [9]:
def my_bag_of_words(text, words_to_index, dict_size):
    result_vector = np.zeros(dict_size)
    for word in text.split(' '):
        if word in words_to_index:
            result_vector[words_to_index[word]] +=1
    return result_vector

X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_mybag.shape, '\nX_val shape ', X_test_mybag.shape)

X_train shape  (927, 10000) 
X_val shape  (310, 10000)


In [10]:
print(X_train_mybag[0])


  (0, 0)	8.0
  (0, 1)	1.0
  (0, 2)	1.0
  (0, 3)	1.0
  (0, 116)	1.0
  (0, 214)	1.0
  (0, 347)	1.0
  (0, 348)	3.0
  (0, 523)	2.0
  (0, 686)	2.0
  (0, 805)	1.0
  (0, 980)	2.0
  (0, 1234)	3.0
  (0, 1235)	1.0
  (0, 1236)	2.0
  (0, 1237)	1.0
  (0, 1687)	1.0
  (0, 1688)	1.0
  (0, 1689)	1.0
  (0, 1690)	1.0
  (0, 2549)	1.0
  (0, 2550)	1.0
  (0, 2551)	1.0
  (0, 2552)	1.0
  (0, 2553)	1.0
  (0, 2554)	1.0
  (0, 2555)	1.0
  (0, 2556)	1.0
  (0, 2557)	1.0
  (0, 2558)	1.0
  (0, 2559)	1.0
  (0, 2560)	1.0
  (0, 2561)	1.0
  (0, 2562)	1.0
  (0, 2563)	1.0
  (0, 2564)	1.0
  (0, 2565)	1.0
  (0, 2566)	1.0
  (0, 2567)	1.0
  (0, 2568)	1.0
  (0, 2569)	1.0
  (0, 2570)	1.0
  (0, 2571)	1.0
  (0, 2572)	1.0


In [11]:
POPULAR_WORDS[:10]

['nlp',
 'help',
 'needs',
 'today',
 'nlps',
 'family',
 'years',
 '2018',
 'us',
 'support']

## TF-IDF¶


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
## creating tfidf vector
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5)
## transforming it
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [14]:
print(X_train_tfidf[0])

  (0, 663)	0.18353528942150774
  (0, 553)	0.16712875883324108
  (0, 1185)	0.20182236858230343
  (0, 928)	0.6729898429180977
  (0, 563)	0.4372117722921825
  (0, 326)	0.4486598952787318
  (0, 783)	0.0419088341372966
  (0, 1126)	0.052349394245470186
  (0, 554)	0.21860588614609125


## Classification

In [15]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
def train_classifier(X_train, y_train, C, regularisation):
    model = OneVsRestClassifier(LogisticRegression(penalty=regularisation, C=C, max_iter=10000)).fit(X_train, y_train)
    return model

In [16]:
classifier_mybag = train_classifier(X_train_mybag, y_train, C = 4, regularisation = 'l2')
classifier_tfidf = train_classifier(X_train_tfidf, y_train, C = 4, regularisation = 'l2')



In [17]:
y_test_predicted_labels_mybag = classifier_mybag.predict(X_test_mybag)
y_test_predicted_labels_tfidf = classifier_tfidf.predict(X_test_tfidf)

In [18]:
y_test_predicted_scores_mybag = classifier_mybag.decision_function(X_test_mybag)
y_test_predicted_scores_tfidf = classifier_tfidf.decision_function(X_test_tfidf)

In [19]:
y_test_predicted_scores_mybag

array([[ -7.8984405 ,  -4.56843965,  -7.20291397, ...,  -7.79626134,
         -5.21450572,  -4.88280427],
       [ -5.88841381,  -7.92511479,  -3.33961138, ...,  -3.35376249,
         -4.3727636 ,  -2.72938289],
       [-10.46610227,  -0.80775191,  -6.07997797, ...,  -5.10231465,
         -6.84914677,  -5.86018039],
       ...,
       [ -4.59829906,  -5.85600628,  -4.57474017, ...,  -6.36047193,
         -2.94255695,  -4.14086362],
       [ -6.46634004,  -7.75027993,  -4.40796338, ...,  -3.39192914,
         -4.30187695,  -5.77795586],
       [ -6.10529502,  -6.06100736,  -7.17042775, ...,  -2.57392758,
         -6.64355046,  -5.60570594]])

## Evaluation


In [20]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [21]:
classifier_tfidf = train_classifier(X_train_tfidf, y_train, C = 50, regularisation = 'l2')

y_test_predicted_labels_mybag = classifier_mybag.predict(X_test_mybag)
y_test_predicted_labels_tfidf = classifier_tfidf.predict(X_test_tfidf)

def print_evaluation_scores(y_test, predicted):
    
    print('Accuracy: ', accuracy_score(y_test, predicted, normalize=False))
    print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))
    
print('Bag-of-words\n')
print_evaluation_scores(y_test, y_test_predicted_labels_mybag)
print('\nTfidf\n')
print_evaluation_scores(y_test, y_test_predicted_labels_tfidf)

Bag-of-words

Accuracy:  102
F1-score weighted:  0.331068799235176

Tfidf

Accuracy:  96
F1-score weighted:  0.3110173676505555


  'precision', 'predicted', average, warn_for)
