# NOTES
* we're doing the whole NLP process today

### Data Acquisition

In [14]:
import pandas as pd
original = pd.read_csv('economic_news.csv', encoding = 'ISO-8859-1')
required_data = original[['text', 'relevance']]
required_data

Unnamed: 0,text,relevance
0,NEW YORK -- Yields on most certificates of dep...,yes
1,The Wall Street Journal Online</br></br>The Mo...,no
2,WASHINGTON -- In an effort to achieve banking ...,no
3,The statistics on the enormous costs of employ...,no
4,NEW YORK -- Indecision marked the dollar's ton...,yes
...,...,...
7995,Secretary of Commerce Charles W. Sawyer said y...,yes
7996,"U.S. stocks inched up last week, overcoming co...",no
7997,Ben S. Bernanke cleared a key hurdle Thursday ...,no
7998,The White House's push to contract out many fe...,no


Removing invalid labels from the dataset based on a condition

In [15]:
required_data.drop(required_data.loc[required_data['relevance'] == "not sure"].index, inplace = True)
required_data['relevance'].value_counts()

relevance
no     6571
yes    1420
Name: count, dtype: int64

Largest class should be less than 2x smallest class

### Text Cleaning and Pre Processing

In [16]:
import spacy
nlp = spacy.load('en_core_web_sm')
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [17]:
sample_doc = nlp("Hi, I like 3 Nolan's, two Cameron's & 12 Spielberg's movies.")
for token in sample_doc:
    #print(f"token: {token}\t\t| is punct? {token.is_punct}\t\t| like num? {token.like_num}\t\t| is digit? {token.is_digit}")
    print(f"{token}\t\t{token.is_stop}")

Hi		False
,		False
I		True
like		False
3		False
Nolan		False
's		True
,		False
two		True
Cameron		False
's		True
&		False
12		False
Spielberg		False
's		True
movies		False
.		False


In [None]:
def clean_preprocess(text):
    # Lowercase
    outtext = text.lower()
    # Remove unwanted/weird characters
    outtext = outtext.replace('</br>', ' ')
    # Remove punctuation and digits
    doc = nlp(outtext)
    outtext = [token.text for token in doc if (not token.is_punct) and (not token.like_num)]
    # remove stop words
    ## SPACY
    # text = [token.text for token in doc if (not token.is_stop)]
    ## SCIKIT
    outtext = [word for word in outtext if word not in ENGLISH_STOP_WORDS]
    processed_text = ' '.join(outtext) # join list of words into sentences
    return processed_text

In [19]:
clean_preprocess("I like Nolan's, and Spielberg's movies. They are 2 awesome")

"like nolan 's spielberg 's movies awesome"

In [20]:
required_data.text = required_data.text.apply(lambda row: clean_preprocess(row))

### Feature Engineering or Text Representation

In [21]:
from sklearn.feature_extraction.text import CountVectorizer # Bag of Words

In [47]:
sentence = ["dog likes cat", "dog eats meat", "cat eats meat", "dog bites man", "dog likes dog"]
vect = CountVectorizer()
x_data = vect.fit_transform(required_data.text)

In [None]:
print(required_data['relevance'])
required_data['relevance'] = required_data['relevance'].map({'yes':1, 'no':0})
y_data = required_data['relevance']

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
7995   NaN
7996   NaN
7997   NaN
7998   NaN
7999   NaN
Name: relevance, Length: 7991, dtype: float64

### Building a Model

In [37]:
# split dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, random_state=1)

In [None]:
from sklearn.naive_bayes import MultinomialNB

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
7995   NaN
7996   NaN
7997   NaN
7998   NaN
7999   NaN
Name: relevance, Length: 7991, dtype: float64

In [42]:
model = MultinomialNB()
model.fit(X_train, y_train)

ValueError: Input y contains NaN.

In [None]:
y_pred = model.predict(X_test)

### Evaluating the Model

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
accuracy_score(y_pred, y_test)

In [None]:
confusion_matrix(y_pred,y_test)