## Preprocessing with scikit learn

In [13]:
import os
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [14]:
# Loading file from path
def loading_file():
    file_dir = '/home/nbuser/library/1. Classifier/3. Exploratory Data Analysis'        
    file_list = glob.glob(file_dir + '/*.csv')
    csv_file = file_list[0]
    return csv_file

# Import file imto Pandas DataFrame
def importing_file(csv_file):
    df = pd.read_csv(csv_file, sep=",")
    return df

# Saving path
def saving_file(file, file_name, save_dir):
    file.to_csv(os.path.join(save_dir,file_name))


In [15]:
# Importing file + Loading  file
news_df = importing_file(loading_file())

# Top 5 records
news_df.head()

Unnamed: 0,file_name,title,news_text,category
0,348.txt,Berlin celebrates European cinema,Organisers say this year's Berlin Film Festiva...,entertainment
1,139.txt,U2 to play at Grammy awards show,Irish rock band U2 are to play live at the Gra...,entertainment
2,125.txt,Snow Patrol feted at Irish awards,Snow Patrol were the big winners in Ireland's ...,entertainment
3,267.txt,T in the Park sells out in days,Tickets for Scotland's biggest music festival ...,entertainment
4,311.txt,Corbett attacks 'dumbed-down TV',Ronnie Corbett has joined fellow comedy stars ...,entertainment


In [24]:
news_df.category.unique()

array(['entertainment', 'politics', 'business', 'sports', 'tech'],
      dtype=object)

### Clean the news_text column

In [None]:
import re
def clean_text(x):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', x)
    return text

In [None]:
def clean_all_text(df):
    for index, item in df.iterrows():
        cleantext = clean_text(item['news_text'])
        item['news_text'] = cleantext
        
    return df

In [None]:
clean_df = clean_all_text(news_df)

In [None]:
print(news_df.iloc[22, 2])

#### CountVectorizer for text classification

In [21]:
# Import the necessary modules
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Create a series to store the labels: y
y = news_df["category"]

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(news_df["news_text"], y, test_size=0.3, random_state=53)

# Initialize a CountVectorizer object: count_vectorizer
count_vectorizer = CountVectorizer(stop_words="english")

# Transform the training data using only the 'text' column values: count_train 
count_train = count_vectorizer.fit_transform(X_train)

# Transform the test data using only the 'text' column values: count_test 
count_test = count_vectorizer.transform(X_test)

# Print the first 10 features of the count_vectorizer
print(count_vectorizer.get_feature_names()[:10])


['00', '000', '0001', '000m', '000s', '000th', '0051', '007', '01', '0100']


#### TfidfVectorizer for text classification

In [22]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english",max_df=0.7)

# Transform the training data: tfidf_train 
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data: tfidf_test 
tfidf_test = tfidf_vectorizer.transform(X_test)

print(tfidf_vectorizer.get_feature_names()[:10])

# Print the first 5 vectors of the tfidf training data
print(tfidf_train.A[:5])


['00', '000', '0001', '000m', '000s', '000th', '0051', '007', '01', '0100']
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.03914722 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


#### Inspecting the vectors

In [23]:
# Create the CountVectorizer DataFrame: count_df
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())

# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())

# Print the head of count_df
print(count_df.head())

# Print the head of tfidf_df
print(tfidf_df.head())

# Calculate the difference in columns: difference
difference = set(count_df.columns) - set(tfidf_df.columns)
print(difference)

# Check whether the DataFrames are equal
print(count_df.equals(tfidf_df))


   00  000  0001  000m  000s  000th  0051  007  01  0100     ...       zones  \
0   0    0     0     0     0      0     0    0   0     0     ...           0   
1   0    0     0     0     0      0     0    0   0     0     ...           0   
2   0    1     0     0     0      0     0    0   0     0     ...           0   
3   0    0     0     0     0      0     0    0   0     0     ...           0   
4   0    0     0     0     0      0     0    0   0     0     ...           0   

   zoom  zooms  zooropa  zornotza  zubair  zurich  zutons  zvonareva  \
0     0      0        0         0       0       0       0          0   
1     0      0        0         0       0       0       0          0   
2     0      0        0         0       0       0       0          0   
3     0      0        0         0       0       0       0          0   
4     0      0        0         0       0       0       0          0   

   zvyagintsev  
0            0  
1            0  
2            0  
3            0  
4

#### Training and testing the "fake news" model with CountVectorizer

In [25]:
# Import the necessary modules
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=['entertainment', 'politics', 'business', 'sports', 'tech'])
print(cm)


0.9733777038269551
[[ 43   1   1   0   3]
 [  0 129   3   0   1]
 [  0   0 138   0   4]
 [  0   0   1 152   0]
 [  1   1   0   0 123]]


#### Training and testing the "fake news" model with TfidfVectorizer

In [26]:
# Create a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(tfidf_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(tfidf_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=['entertainment', 'politics', 'business', 'sports', 'tech'])
print(cm)


0.913477537437604
[[ 10   5   8  17   8]
 [  0 128   4   0   1]
 [  0   0 139   0   3]
 [  0   0   1 152   0]
 [  0   2   3   0 120]]


#### Improving your model


In [33]:
# Create the list of alphas: alphas
alphas = np.arange(0, 1, .1)

# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(tfidf_train, y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(tfidf_test)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test, pred)
    return score

# Iterate over the alphas and print the corresponding score
best_alfa_score = {}
for alpha in alphas:
    best_alfa_score[alpha] = train_and_predict(alpha)
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()
best_score = max(best_alfa_score, key=best_alfa_score.get)

print("Best alfa: ", best_score, "best accuracy: ",best_alfa_score[best_score])

  'setting alpha = %.1e' % _ALPHA_MIN)


Alpha:  0.0
Score:  0.9550748752079867

Alpha:  0.1
Score:  0.9717138103161398

Alpha:  0.2
Score:  0.9550748752079867

Alpha:  0.30000000000000004
Score:  0.9450915141430949

Alpha:  0.4
Score:  0.9334442595673876

Alpha:  0.5
Score:  0.9284525790349417

Alpha:  0.6000000000000001
Score:  0.9284525790349417

Alpha:  0.7000000000000001
Score:  0.9234608985024958

Alpha:  0.8
Score:  0.9217970049916805

Alpha:  0.9
Score:  0.9217970049916805

Best alfa:  0.1 best accuracy:  0.9717138103161398


#### Inspecting the model


In [28]:
# Get the class labels: class_labels
class_labels = nb_classifier.classes_

# Extract the features: feature_names
feature_names = tfidf_vectorizer.get_feature_names()

# Zip the feature names together with the coefficient array and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(nb_classifier.coef_[0], feature_names))

# Print the first class label and the top 20 feat_with_weights entries
print(class_labels[0], feat_with_weights[:20])

# Print the second class label and the bottom 20 feat_with_weights entries
print(class_labels[1], feat_with_weights[-20:])


business [(-10.204576282558833, '00'), (-10.204576282558833, '0001'), (-10.204576282558833, '000m'), (-10.204576282558833, '000s'), (-10.204576282558833, '000th'), (-10.204576282558833, '0051'), (-10.204576282558833, '007'), (-10.204576282558833, '0100'), (-10.204576282558833, '0130'), (-10.204576282558833, '02'), (-10.204576282558833, '0227'), (-10.204576282558833, '028'), (-10.204576282558833, '033'), (-10.204576282558833, '04m'), (-10.204576282558833, '04secs'), (-10.204576282558833, '056'), (-10.204576282558833, '05m'), (-10.204576282558833, '060'), (-10.204576282558833, '072'), (-10.204576282558833, '080')]
entertainment [(-8.36079612343115, 'rates'), (-8.341688436601189, 'december'), (-8.33235401825379, 'rise'), (-8.229722130061955, 'government'), (-8.218499183038658, 'yukos'), (-8.178871053906013, 'china'), (-8.106851667128925, 'prices'), (-8.074253466673845, '2004'), (-8.069045380351342, 'shares'), (-8.052534937301782, 'economic'), (-8.042430513612883, 'firm'), (-8.002019779531