# Classic NLP

1. Text normalization
2. Tokenization (can be skipped when using sklearn vectorizers, which tokenize internally)
3. Preprocessing (stop-word removal, stemming, lemmatization)
4. Feature extraction/vectorization (BoW, TF-IDF; sklearn's vectorizers also tokenize automatically)
5. Model input

In [1]:
import pandas as pd

In [2]:
import sys

# Make the parent directory (the project root) importable so `src.*` resolves.
# The membership check keeps the cell idempotent: re-running it does not stack
# duplicate ".." entries onto sys.path.
if ".." not in sys.path:
    sys.path.append("..")  # points to project root

In [3]:
from src.data import load_data

In [4]:
# Location of the training CSV, relative to the notebook's working directory.
filename = "../data/training_data.csv"

# Load the data with the project helper from src.data and preview the first rows.
df = load_data(filename)
df.head()

Unnamed: 0,labels,text
0,0,donald trump sends out embarrassing new year‚s...
1,0,drunk bragging trump staffer started russian c...
2,0,sheriff david clarke becomes an internet joke ...
3,0,trump is so obsessed he even has obama‚s name ...
4,0,pope francis just called out donald trump duri...


In [5]:
# Features are the raw text column; targets are the labels column.
X = df["text"]
y = df["labels"]

In [6]:
# Pick a single row by its index label to try the cleaning step on.
sample = X.loc[10]
sample

'papa john‚s founder retires'

In [7]:
from src.data_cleaning import clean_text

# Try the cleaner on the single sample before applying it to the whole column.
clean_text(sample)

'papa john founder retires'

In [8]:
# Build the cleaned series from the original column rather than from `X` itself,
# so re-running this cell never feeds already-cleaned text back through
# clean_text. The name `X` is kept because the following cells read it.
X = df.text.apply(clean_text)
X.head()

0    donald trump sends out embarrassing new year e...
1    drunk bragging trump staffer started russian c...
2    sheriff david clarke becomes an internet joke ...
3    trump is so obsessed he even has obama name co...
4    pope francis just called out donald trump duri...
Name: text, dtype: object

## Lemmatization

In [9]:
# Last rows of the cleaned text, as a reference point for the lemmatized output below.
X.tail()

34147    tears in rain as thais gather for late king fu...
34148    pyongyang university needs non teachers as tra...
34149    philippine president duterte to visit japan ah...
34150    japan abe may have won election but many don w...
34151    demoralized and divided inside catalonia polic...
Name: text, dtype: object

In [10]:
from src.preprocessing import lemmatize

# Lemmatize every cleaned text; the result gets its own name so `X` stays
# available for comparison.
# NOTE(review): in the saved output "as thais" becomes "a thai", i.e. the
# function word "as" is reduced to "a" — worth checking how lemmatize handles
# non-noun tokens.
X_lemma = X.apply(lemmatize)
X_lemma.tail()

Missing wordnet. Downloading to /var/home/anne/Documents/_Ironhack/news-credibility-classifier/data/nltk_data...
Missing omw-1.4. Downloading to /var/home/anne/Documents/_Ironhack/news-credibility-classifier/data/nltk_data...
Missing averaged_perceptron_tagger. Downloading to /var/home/anne/Documents/_Ironhack/news-credibility-classifier/data/nltk_data...
Missing averaged_perceptron_tagger_eng. Downloading to /var/home/anne/Documents/_Ironhack/news-credibility-classifier/data/nltk_data...
Missing punkt. Downloading to /var/home/anne/Documents/_Ironhack/news-credibility-classifier/data/nltk_data...
Missing punkt_tab. Downloading to /var/home/anne/Documents/_Ironhack/news-credibility-classifier/data/nltk_data...


34147     tear in rain a thai gather for late king funeral
34148    pyongyang university need non teacher a travel...
34149    philippine president duterte to visit japan ah...
34150    japan abe may have won election but many don w...
34151    demoralize and divide inside catalonia police ...
Name: text, dtype: object

In [11]:
# Number of labels available before splitting.
y.shape

(34152,)

## Train/test split 

In [None]:
from sklearn.model_selection import train_test_split

# Split settings: the share of rows held out for testing and the seed for the shuffle.
test_size = 0.2
random_state = 13

# Split the lemmatized texts and their labels together so rows stay aligned.
X_train, X_test, y_train, y_test = train_test_split(
    X_lemma,
    y,
    test_size=test_size,
    random_state=random_state,
)


In [13]:
# Size of the training portion produced by the split.
X_train.shape

(27321,)

In [14]:
# Preview the training texts; the index labels are the original row labels, no longer in order after the split.
X_train.head()

4621     campaign in damage control mode after trump jr...
23049    virginia judge issue new injunction against tr...
19507    trump to nominate richard grenell to be ambass...
16067    crony corrupt politics obama admin block fbi f...
33249    saudi prince relieve from national guard once ...
Name: text, dtype: object

## Feature extraction

In [15]:
from src.feature_extraction import get_representation

# Vectorizer settings live in one place because they are also logged alongside
# the metrics later in the notebook.
# NOTE(review): vectorizer_type is not passed to get_representation in this
# call — presumably the helper defaults to TF-IDF; confirm against its signature.
vectorizer_type = "tfidf"
vectorizer_params = {
    "ngram_range": (1, 2),
    "max_features": 1000,
    "max_df": 1.0,
}

# Only the training texts are passed in; the returned vectorizer is reused
# later to transform the test texts.
X_train_vector, vectorizer, feature_names = get_representation(
    X_train,
    vectorizer_params["max_features"],
    vectorizer_params["ngram_range"],
    max_df=vectorizer_params["max_df"],
)



In [16]:
# First ten feature names; both single words and word pairs appear, matching ngram_range (1, 2).
feature_names[:10]

array(['abortion', 'about', 'about the', 'about trump', 'abuse', 'accuse',
       'act', 'action', 'activist', 'ad'], dtype=object)

In [17]:
# Inspect the vectorized training data as a dense array.
# NOTE(review): .toarray() materializes the entire matrix (27321 rows per
# X_train.shape, max_features=1000 columns) just to display it; assuming this is
# a scipy sparse matrix, slicing first (e.g. X_train_vector[:5].toarray())
# would show the same kind of preview at a fraction of the memory.
X_train_vector.toarray()

array([[0.       , 0.2302197, 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]])

## Train model

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest with the same `random_state` used for the train/test split.
# A random forest draws bootstrap samples and random feature subsets, so
# without a seed the fitted model — and every metric reported below — changes
# from one run to the next.
randomclassifier = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    random_state=random_state,
)
randomclassifier.fit(X_train_vector, y_train)


## Save metrics

In [19]:
from src.evaluate import add_new_metrics

# Start from an empty frame; each add_new_metrics call returns the frame that
# is assigned back to metrics_df.
metrics_df = pd.DataFrame()

# Train split: scored on the same matrix the model was fitted on.
metrics_df = add_new_metrics(
    metrics_df,
    randomclassifier,
    X_train_vector,
    y_train,
    "train",
    "testing metrics",
    vectorizer_type,
    vectorizer_params,
)

# Test split: the already-fitted vectorizer is applied with transform only.
X_test_vector = vectorizer.transform(X_test)
metrics_df = add_new_metrics(
    metrics_df,
    randomclassifier,
    X_test_vector,
    y_test,
    "test",
    "testing test metrics",
    vectorizer_type,
    vectorizer_params,
)


In [20]:
# Show the collected metrics, one row per evaluated split.
# NOTE(review): in the saved output train accuracy is ~0.9996 versus ~0.907 on
# test, which suggests the forest is overfitting the training set.
metrics_df

Unnamed: 0,model,split,vectorizer,accuracy,precision,recall,f1,comments,ngram_range,max_features,max_df
0,RandomForestClassifier,train,tfidf,0.999561,0.999399,0.9997,0.999549,testing metrics,"(1, 2)",1000,1.0
1,RandomForestClassifier,test,tfidf,0.906602,0.902111,0.902663,0.902387,testing test metrics,"(1, 2)",1000,1.0
