In [1]:
import os
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Import `fake_or_real_news.csv` straight from its public URL
df = pd.read_csv("https://s3.amazonaws.com/assets.datacamp.com/blog_assets/fake_or_real_news.csv")

# Inspect shape of `df`. It is printed explicitly because a notebook cell only
# auto-displays its final expression; a bare `df.shape` here would be
# evaluated and silently discarded.
print(df.shape)

# Show the first lines of `df` (last expression, so it is rendered as a table)
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
# Use the "Unnamed: 0" column as the index.
# The guard keeps this cell safe to re-run: once the column has been moved
# into the index it is no longer in `df.columns`, and calling `set_index`
# on it a second time would raise KeyError.
if "Unnamed: 0" in df.columns:
    df = df.set_index("Unnamed: 0")

# Show the first lines of `df`
df.head()

Unnamed: 0_level_0,title,text,label
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
from sklearn.model_selection import train_test_split
# Target vector: the `label` column
y = df.label

# Feature frame: everything except `label`.
# `DataFrame.drop` returns a new frame rather than modifying `df`, so the
# result has to be bound to a name -- calling it bare is a no-op. `df` itself
# is left untouched, which keeps this cell re-runnable.
features = df.drop("label", axis=1)

# Make training and test sets from the article text only
# (33% held out; fixed `random_state` so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    features["text"], y, test_size=0.33, random_state=53
)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Bag-of-words vectorizer with the built-in English stop-word list
count_vectorizer = CountVectorizer(stop_words="english")

# Fit on the training texts and encode them in one step
count_train = count_vectorizer.fit_transform(X_train)

# Encode the test texts with the already-fitted vectorizer (no refit)
count_test = count_vectorizer.transform(X_test)

In [9]:
# TF-IDF vectorizer: English stop-word list, `max_df` capped at 0.7
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.7,
)

# Fit on the training texts and encode them in one step
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Encode the test texts with the already-fitted vectorizer (no refit)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [10]:
def _vocabulary(vectorizer):
    """Return the fitted vectorizer's feature names as a plain Python list.

    scikit-learn 1.0 introduced ``get_feature_names_out`` and 1.2 removed the
    older ``get_feature_names``. The new accessor is tried first and the old
    one is used as a fallback, so this cell runs on either side of that
    change. ``tolist()`` keeps the printed output a list of ``str`` as before.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out().tolist()
    return vectorizer.get_feature_names()


# Last ten feature names of `tfidf_vectorizer`
print(_vocabulary(tfidf_vectorizer)[-10:])

# First ten feature names of `count_vectorizer`
print(_vocabulary(count_vectorizer)[:10])

['حلب', 'عربي', 'عن', 'لم', 'ما', 'محاولات', 'من', 'هذا', 'والمرضى', 'ยงade']
['00', '000', '0000', '00000031', '000035', '00006', '0001', '0001pt', '000ft', '000km']


In [11]:
# Loading file from path
def loading_file(file_dir='/home/nbuser/library/1. Classifier/3. Exploratory Data Analysis'):
    """Return the path of the first ``*.csv`` file directly inside ``file_dir``.

    Parameters
    ----------
    file_dir : str, optional
        Directory to search. Defaults to the notebook's original location so
        existing ``loading_file()`` calls keep working; pass another directory
        to run the notebook on a different machine.

    Returns
    -------
    str
        Path of the alphabetically first matching CSV file.

    Raises
    ------
    FileNotFoundError
        If ``file_dir`` contains no ``*.csv`` file (previously this surfaced
        as an uninformative ``IndexError`` from ``file_list[0]``).
    """
    # Escape the directory part so characters such as "[" in a folder name are
    # matched literally instead of being read as glob syntax.
    pattern = os.path.join(glob.escape(file_dir), '*.csv')
    # glob returns matches in arbitrary order; sort so "first" is deterministic.
    file_list = sorted(glob.glob(pattern))
    if not file_list:
        raise FileNotFoundError("No CSV file found in {!r}".format(file_dir))
    return file_list[0]

# Import file into a pandas DataFrame
def importing_file(csv_file):
    """Read the comma-separated file ``csv_file`` and return it as a DataFrame."""
    return pd.read_csv(csv_file, sep=",")

# Saving path
def saving_file(file, file_name, save_dir):
    """Write ``file`` as CSV to ``file_name`` inside ``save_dir``.

    ``file`` is any object exposing ``to_csv(path)``; nothing is returned.
    """
    destination = os.path.join(save_dir, file_name)
    file.to_csv(destination)

In [31]:
# Locate the CSV with `loading_file`, then read it with `importing_file`
news_df = importing_file(
    loading_file()
)

# Preview the first five rows
news_df.head()

Unnamed: 0,file_name,title,news_text,category,news_length
0,380.txt,Boogeyman takes box office lead,The low-budget horror film Boogeyman has knock...,entertainment,1093
1,011.txt,Artists' secret postcards on sale,Postcards by artists including Damien Hirst an...,entertainment,1207
2,291.txt,U2 stars enter rock Hall of Fame,Singer Bruce Springsteen has inducted Irish ro...,entertainment,2364
3,184.txt,Top of the Pops leaves BBC One,The BBC's flagship pop music programme Top of ...,entertainment,2133
4,037.txt,Film row over Pirates 'cannibals',Plans to portray Dominica's Carib Indians as c...,entertainment,1514


In [32]:
# Shuffle the rows while keeping their original index labels.
# `sample(frac=1)` draws every row in random order; the fixed `random_state`
# makes the order the same on every Restart & Run All, which the previous
# unseeded `np.random.permutation` did not.
news_df = news_df.sample(frac=1, random_state=42)

In [33]:
# Preview the first five rows of `news_df` after the shuffle
news_df.head()

Unnamed: 0,file_name,title,news_text,category,news_length
679,086.txt,Howard backs stem cell research,"Michael Howard has backed stem cell research, ...",politics,2342
1195,209.txt,Mixed reaction to Man Utd offer,Shares in Manchester United were up over 5% by...,business,2686
1597,414.txt,England 17-18 France,England suffered an eighth defeat in 11 Tests ...,sports,3923
1690,344.txt,Harinordoquy suffers France axe,Number eight Imanol Harinordoquy has been drop...,sports,2196
663,178.txt,Hatfield executives go on trial,Engineering firm Balfour Beatty and five railw...,politics,1249


In [37]:
# Summarise `news_df`: index range, per-column non-null counts, dtypes and memory usage
news_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2225 entries, 679 to 357
Data columns (total 5 columns):
file_name      2225 non-null object
title          2225 non-null object
news_text      2225 non-null object
category       2225 non-null object
news_length    2225 non-null int64
dtypes: int64(1), object(4)
memory usage: 104.3+ KB


Text Classification With Scikit-Learn¶
https://e-string.com/articles/text-classification-with-sklearn/