In [1]:
import pandas as pd
import re
import nltk
from collections import defaultdict # Dictionaries with default values
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gauri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
## importing data sets and dropping nan's
df = pd.read_csv('GFM_data.csv',sep = '\t')
df = df.loc[df['Text'].dropna().index]
df.head()

Unnamed: 0.1,Unnamed: 0,Url,Category,Position,Title,Location,Amount_Raised,Goal,Number_of_Donations,Length_of_Fundraising,FB_Shares,Number_of_Donors,Followers,Text
0,0,https://www.gofundme.com/f/justiceforjacobblake,Medical,0,Justice for Jacob Blake,"Kenosha, WI",2297930.0,3000000.0,73K,93 days 12:02:38.405126000,118K,72.5K,73.4K,On August 23rd my son was shot multiple times ...
1,0,https://www.gofundme.com/f/official-navajo-nat...,Medical,0,Official Navajo Nation COVID-19 Relief Fund,"Window Rock, AZ",1862040.0,1000000.0,22.5K,205 days 12:02:39.366241000,71.7K,21.9K,22K,\r\nThe Navajo Nation COVID-19 Fund has been e...
2,0,https://www.gofundme.com/f/help-a-front-line-n...,Medical,0,Help a front line nurse and baby get proper care,"Randolph, NJ",954793.0,1200000.0,19K,215 days 12:02:40.340314000,16.4K,18.3K,17.9K,"On Sunday, April 12, Sylvia Leroy, a pregnant ..."
3,0,https://www.gofundme.com/f/Tommy-Rivers-Rest-Up,Medical,1,"Rest up, Tommy, we'll see you soon","Scottsdale, AZ",673179.0,1000000.0,11.3K,131 days 12:02:41.464483000,21.3K,10.3K,10.4K,"First, thank you for being here. Tommy Rivers ..."
4,0,https://www.gofundme.com/f/brandon039s-medical...,Medical,1,OFFICIAL BRANDON SAENZ MEDICAL FUND,"Tyler, TX",570529.0,750000.0,24.7K,175 days 12:02:42.383091000,5.5K,24.3K,24.5K,My name is Melissa Green and I am the mother o...


## Text preprocessing using Regex

In [3]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
REPLACE_IP_ADDRESS = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')

def extract_entities(text):
    names = []
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                names.append(' '.join(c[0] for c in chunk.leaves()))
    new_text = text
    for name in names:
        if name in text:
            new_text = new_text.replace(name, 'NLP')
    return new_text

def clean_text(x):
    ## removing names
    x = extract_entities(x)
    ## normalizing text by stripping white space and lower casing
    x =  x.lower().strip()
    ## removing urls
    x = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', x)
    ## removing phone numbers
    x = re.sub('\([0-9]{3}\)\s*[0-9]{3}-[0-9]{4}','',x)
    ## strip all non alphanumeric things
    x = re.sub('\n',' ',x)
    x = re.sub("[^a-zA-Z0-9 #]",'',x)
    x = re.sub("\s+",' ',x)
    text = x.replace('\n', ' ').lower()# lowercase text
    text = REPLACE_IP_ADDRESS.sub('', text) # remove ip address
    text = REPLACE_BY_SPACE_RE.sub(' ',text)# replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('',text)# delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join([w for w in text.split() if not w in STOPWORDS])# delete stopwords from text
    
    return text

In [4]:
text = df['Text'][4]
new_text = clean_text(text)
df['Text'] = df['Text'].apply(clean_text)
print(text)
print()
print(new_text)




## Splitting into train and test set

In [5]:
categories = {i: idx for idx,i in enumerate(df['Category'].unique())}

In [6]:
from sklearn.model_selection import train_test_split
X = df['Text']
y = [categories[i] for i in df['Category']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## BoW (Bag of Words)


In [7]:
from scipy import sparse as sp_sparse

In [8]:
# Dictionary of all words from train corpus with their counts.
words_counts = {}
for comments in X_train:
    for word in comments.split():
        if word not in words_counts:
            words_counts[word] = 0
        words_counts[word] += 1
##most pop words        
DICT_SIZE = 10000
POPULAR_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
## same dics but flopped
WORDS_TO_INDEX = {key: rank for rank, key in enumerate(POPULAR_WORDS, 0)}
INDEX_TO_WORDS = {index:word for word, index in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()

In [9]:
def my_bag_of_words(text, words_to_index, dict_size):
    result_vector = np.zeros(dict_size)
    for word in text.split(' '):
        if word in words_to_index:
            result_vector[words_to_index[word]] +=1
    return result_vector

X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_mybag.shape, '\nX_val shape ', X_test_mybag.shape)

X_train shape  (646, 10000) 
X_val shape  (216, 10000)


In [10]:
print(X_train_mybag[0])


  (0, 0)	27.0
  (0, 1)	1.0
  (0, 3)	5.0
  (0, 4)	5.0
  (0, 5)	4.0
  (0, 6)	2.0
  (0, 7)	2.0
  (0, 8)	3.0
  (0, 9)	5.0
  (0, 10)	2.0
  (0, 11)	1.0
  (0, 13)	3.0
  (0, 14)	1.0
  (0, 17)	3.0
  (0, 18)	1.0
  (0, 19)	1.0
  (0, 20)	1.0
  (0, 21)	2.0
  (0, 22)	1.0
  (0, 25)	2.0
  (0, 29)	1.0
  (0, 33)	1.0
  (0, 36)	1.0
  (0, 41)	1.0
  (0, 43)	4.0
  :	:
  (0, 6877)	1.0
  (0, 6878)	1.0
  (0, 6879)	1.0
  (0, 6880)	1.0
  (0, 6881)	1.0
  (0, 6882)	1.0
  (0, 6883)	1.0
  (0, 6884)	1.0
  (0, 6885)	1.0
  (0, 6886)	1.0
  (0, 6887)	1.0
  (0, 6888)	1.0
  (0, 6889)	1.0
  (0, 6890)	1.0
  (0, 6891)	1.0
  (0, 6892)	1.0
  (0, 6893)	1.0
  (0, 6894)	1.0
  (0, 6895)	1.0
  (0, 6896)	1.0
  (0, 6897)	1.0
  (0, 6898)	1.0
  (0, 6899)	1.0
  (0, 6900)	1.0
  (0, 6901)	1.0


In [11]:
POPULAR_WORDS[:10]

['nlp',
 'help',
 'nlps',
 'family',
 'us',
 'support',
 'community',
 'time',
 'many',
 'years']

## TF-IDF¶


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
## creating tfidf vector
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5)
## transforming it
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [14]:
print(X_train_tfidf[0])

  (0, 3450)	0.05330902462212366
  (0, 92)	0.10549788875949537
  (0, 2298)	0.057360000299316836
  (0, 3442)	0.04413319827621207
  (0, 73)	0.06351011300383887
  (0, 3987)	0.06839173208596769
  (0, 1259)	0.10371794737266439
  (0, 2123)	0.10355856513852879
  (0, 1716)	0.059599019618880145
  (0, 510)	0.1380780868513717
  (0, 1189)	0.03916272599342111
  (0, 2897)	0.2819240170137015
  (0, 114)	0.06839173208596769
  (0, 4201)	0.0404706289932095
  (0, 1625)	0.03488623698750032
  (0, 499)	0.04262628985226823
  (0, 2927)	0.07832545198684222
  (0, 4250)	0.11865510756390832
  (0, 3239)	0.06351011300383887
  (0, 1172)	0.07502177135205362
  (0, 3860)	0.043401758506643406
  (0, 1453)	0.04942297692192443
  (0, 3710)	0.07886079137268459
  (0, 4164)	0.06653582664522843
  (0, 712)	0.047266276840831505
  :	:
  (0, 4269)	0.06004654914499174
  (0, 2626)	0.09042351305974611
  (0, 2429)	0.06839173208596769
  (0, 1262)	0.13307165329045686
  (0, 511)	0.055853091875373004
  (0, 2510)	0.06839173208596769
  (0, 289

## Classification

In [15]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
def train_classifier(X_train, y_train, C, regularisation):
    model = OneVsRestClassifier(LogisticRegression(penalty=regularisation, C=C, max_iter=10000)).fit(X_train, y_train)
    return model

In [16]:
classifier_mybag = train_classifier(X_train_mybag, y_train, C = 4, regularisation = 'l2')
classifier_tfidf = train_classifier(X_train_tfidf, y_train, C = 4, regularisation = 'l2')



In [17]:
y_test_predicted_labels_mybag = classifier_mybag.predict(X_test_mybag)
y_test_predicted_labels_tfidf = classifier_tfidf.predict(X_test_tfidf)

In [18]:
y_test_predicted_scores_mybag = classifier_mybag.decision_function(X_test_mybag)
y_test_predicted_scores_tfidf = classifier_tfidf.decision_function(X_test_tfidf)

In [19]:
y_test_predicted_scores_mybag

array([[ -7.8984405 ,  -4.56843965,  -7.20291397, ...,  -7.79626134,
         -5.21450572,  -4.88280427],
       [ -5.88841381,  -7.92511479,  -3.33961138, ...,  -3.35376249,
         -4.3727636 ,  -2.72938289],
       [-10.46610227,  -0.80775191,  -6.07997797, ...,  -5.10231465,
         -6.84914677,  -5.86018039],
       ...,
       [ -4.59829906,  -5.85600628,  -4.57474017, ...,  -6.36047193,
         -2.94255695,  -4.14086362],
       [ -6.46634004,  -7.75027993,  -4.40796338, ...,  -3.39192914,
         -4.30187695,  -5.77795586],
       [ -6.10529502,  -6.06100736,  -7.17042775, ...,  -2.57392758,
         -6.64355046,  -5.60570594]])

## Evaluation


In [20]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [21]:
classifier_tfidf = train_classifier(X_train_tfidf, y_train, C = 50, regularisation = 'l2')

y_test_predicted_labels_mybag = classifier_mybag.predict(X_test_mybag)
y_test_predicted_labels_tfidf = classifier_tfidf.predict(X_test_tfidf)

def print_evaluation_scores(y_test, predicted):
    
    print('Accuracy: ', accuracy_score(y_test, predicted, normalize=False))
    print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))
    
print('Bag-of-words\n')
print_evaluation_scores(y_test, y_test_predicted_labels_mybag)
print('\nTfidf\n')
print_evaluation_scores(y_test, y_test_predicted_labels_tfidf)

Bag-of-words

Accuracy:  102
F1-score weighted:  0.331068799235176

Tfidf

Accuracy:  96
F1-score weighted:  0.3110173676505555


  'precision', 'predicted', average, warn_for)
