In [1]:
import pickle
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import numpy as np

sns.set_style("whitegrid")
# import altair as alt
# alt.renderers.enable("notebook")

# Code for hiding seaborn warnings
import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_rows = 4000

In [2]:
# Downloading punkt and wordnet from NLTK
nltk.download('punkt')
print("------------------------------------------------------------")
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\test\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
------------------------------------------------------------
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\test\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
df = pd.read_csv('cleanish_data.csv')

In [4]:
df.head()

Unnamed: 0,publication,id,title,publication.1,year,content,content_len
0,Breitbart,29141,GOP House Majority Whip Scalise on Forgoing Au...,Breitbart,2017.0,On Sunday’s broadcast of Fox News Channel’s “S...,2034
1,Breitbart,38328,Megan Rapinoe Stands for Thailand Anthem Befor...,Breitbart,2016.0,Megan Rapinoe knelt for “The Star Spangled Ban...,2704
2,Breitbart,29992,Trump Administration Solicits Border Wall Prop...,Breitbart,2017.0,The Department of Homeland Security wants comp...,2890
3,Breitbart,46302,Mark Levin: Trump the Globalist - Breitbart,Breitbart,2016.0,Mark Levin writes at Conservative Review: One...,2150
4,Breitbart,26688,"Laptop with Trump Tower Floor Plans, National ...",Breitbart,2017.0,The New York Daily News cited police sources o...,2699


In [5]:
df['content_full'] = df['title'] + ' ' + df['content']

In [6]:
df['content_cleaned_1'] = df['content_full'].str.replace("    ", " ")
df['content_cleaned_2'] = df['content_cleaned_1'].str.lower()

In [7]:
punctuation_signs = list("?:!.,;…“”'’\"")
df['content_cleaned_3'] = df['content_cleaned_2']

for punct_sign in punctuation_signs:
    df['content_cleaned_3'] = df['content_cleaned_3'].str.replace(punct_sign, '')

In [8]:
df['content_cleaned_4'] = df['content_cleaned_3'].str.replace("'s", "")

### Lemmatization

In [9]:
# Saving the lemmatizer into an object
wordnet_lemmatizer = WordNetLemmatizer()

In [10]:
nrows = len(df)
lemmatized_text_list = []

for row in range(nrows):
    
    # Create an empty list containing lemmatized words
    lemmatized_list = []
    
    # Save the text and its words into an object
    text = df.loc[row]['content_cleaned_4']
    text_words = text.split(" ")

    # Iterate through every word to lemmatize
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    # Join the list
    lemmatized_text = " ".join(lemmatized_list)
    
    # Append to the list containing the texts
    lemmatized_text_list.append(lemmatized_text)

In [11]:
df['content_cleaned_5'] = lemmatized_text_list

In [12]:
%%capture
# Downloading the stop words list
nltk.download('stopwords')

In [13]:
# Loading the stop words in english
stop_words = list(stopwords.words('english')) + list(map(lambda x: x.split(' ')[0].lower(), df['publication'].unique()))

In [14]:
df['content_cleaned_6'] = df['content_cleaned_5']
for stop_word in stop_words:
    regex_stopword = r"\b" + stop_word + r"\b"
    df['content_cleaned_6'] = df['content_cleaned_6'].str.replace(regex_stopword, '')

In [15]:
df['content_cleaned_7'] = df['content_cleaned_6']
df['content_cleaned_7'] = df['content_cleaned_7'].apply(lambda x: re.sub("\s\s+", " ", x))

In [16]:
# for i in range(1,7):
#     print(i, df['Content_Parsed_{}'.format(i)].iloc[5], '\n')

In [17]:
df['content_cleaned_7'].iloc[5]

'pat caddell fuse light political transformation - ask republican presidential candidates sen ted cruz donald trump consider transactional transformational candidates today news daily sirius xm veteran pollster pat caddell tell host stephen k bannon cruzs trump election would shock system [noting trump still way poll add caddell ive say revolutionary period think evidence clear public fact match light public move motion dont know theyre go go ask mean fuse light caddell elaborate see transformation paradigm politics old rule fail old assumptions dont work new set assumptions evolve public longer seem want eat dog food establishment dish fact go choices outside box ordinary politics caddell point conventional wisdom nine months ago jeb bush hillary clinton would prevail respective primaries country say want something different want take back importantly reclaim sovereignty say add people seem go passive observers active participants caddell say believe many people usually didnt vote com

In [18]:
df = df.rename(columns={'content_cleaned_7': 'content_cleaned'})

In [19]:
list_columns = ["publication", "content_full", "content_cleaned"]
df = df[list_columns]
df.head()

Unnamed: 0,publication,content_full,content_cleaned
0,Breitbart,GOP House Majority Whip Scalise on Forgoing Au...,gop house majority whip scalise forgo august r...
1,Breitbart,Megan Rapinoe Stands for Thailand Anthem Befor...,megan rapinoe stand thailand anthem kneel star...
2,Breitbart,Trump Administration Solicits Border Wall Prop...,trump administration solicit border wall propo...
3,Breitbart,Mark Levin: Trump the Globalist - Breitbart Ma...,mark levin trump globalist - mark levin write ...
4,Breitbart,"Laptop with Trump Tower Floor Plans, National ...",laptop trump tower floor plan national securit...


### Label encoding

In [20]:
df.publication.unique()

array(['Breitbart', 'Buzzfeed News', 'CNN', 'Fox News', 'Guardian', 'NPR'],
      dtype=object)

In [21]:
category_codes = {
    'Breitbart': 0,
    'Buzzfeed News': 1,
    'CNN': 2,
    'Fox News': 3, 
    'Guardian': 4, 
    'NPR': 5
}

In [22]:
# Category mapping
df['Category_Code'] = df['publication']
df = df.replace({'Category_Code':category_codes})

In [23]:
df.head()

Unnamed: 0,publication,content_full,content_cleaned,Category_Code
0,Breitbart,GOP House Majority Whip Scalise on Forgoing Au...,gop house majority whip scalise forgo august r...,0
1,Breitbart,Megan Rapinoe Stands for Thailand Anthem Befor...,megan rapinoe stand thailand anthem kneel star...,0
2,Breitbart,Trump Administration Solicits Border Wall Prop...,trump administration solicit border wall propo...,0
3,Breitbart,Mark Levin: Trump the Globalist - Breitbart Ma...,mark levin trump globalist - mark levin write ...,0
4,Breitbart,"Laptop with Trump Tower Floor Plans, National ...",laptop trump tower floor plan national securit...,0


In [24]:
X_train, X_test, y_train, y_test = train_test_split(df['content_cleaned'], 
                                                    df['Category_Code'], 
                                                    test_size=0.15, 
                                                    random_state=8)

In [25]:
# Parameter election
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 300

In [26]:
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train.shape)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)

(5100, 300)
(900, 300)


In [27]:
import numpy as np

for Product, category_id in sorted(category_codes.items()):
    features_chi2 = chi2(features_train, labels_train == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}' category:".format(Product))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-2:])))
    print("")

# 'Breitbart' category:
  . Most correlated unigrams:
. 2016
. percent
. hillary
. follow
. twitter
  . Most correlated bigrams:
. donald trump
. hillary clinton

# 'Buzzfeed News' category:
  . Most correlated unigrams:
. clinton
. statement
. percent
. company
. news
  . Most correlated bigrams:
. donald trump
. hillary clinton

# 'CNN' category:
  . Most correlated unigrams:
. associate
. washington
. company
. news
. percent
  . Most correlated bigrams:
. unite state
. white house

# 'Fox News' category:
  . Most correlated unigrams:
. democratic
. associate
. cruz
. clinton
. latest
  . Most correlated bigrams:
. donald trump
. hillary clinton

# 'Guardian' category:
  . Most correlated unigrams:
. clinton
. game
. percent
. film
. us
  . Most correlated bigrams:
. donald trump
. hillary clinton

# 'NPR' category:
  . Most correlated unigrams:
. news
. clinton
. twitter
. thats
. percent
  . Most correlated bigrams:
. hillary clinton
. donald trump



In [28]:
print(features_train.shape)
print(features_test.shape)

(5100, 300)
(900, 300)


In [29]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# import seaborn as sns
# from matplotlib import pyplot as plt
# from scipy import stats
# from scipy.stats import mode
# from scipy.stats import norm, skew #for some statistics

# from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
# from sklearn.kernel_ridge import KernelRidge
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import RobustScaler
# from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
# from sklearn.model_selection import KFold, cross_val_score, train_test_split
# from sklearn.metrics import mean_squared_error

#ensembles
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pd.options.display.max_rows=999
pd.options.display.max_columns
# from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
# from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
# from sklearn.kernel_ridge import KernelRidge
# from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
# from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score
# , train_test_split
# from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.naive_bayes import GaussianNB
# from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter('ignore')


In [30]:
num_folds = 5
seed = 7
# scoring = 'accuracy'
scoring = 'accuracy'
# Standardize the dataset
pipelines = []
pipelines.append(('AB', AdaBoostClassifier()))
# pipelines.append(('ET', ExtraTreesClassifier()))
pipelines.append(('LDA', LinearDiscriminantAnalysis()))
pipelines.append(('XGB', XGBClassifier()))
pipelines.append(('ScaledLR', 
                  Pipeline([('Scaler', StandardScaler()),
                            ('LR',LogisticRegression())])))
pipelines.append(('ScaledLDA', 
                  Pipeline([('Scaler', StandardScaler()),
                            ('LDA', LinearDiscriminantAnalysis())])))
# pipelines.append(('ScaledKNN', 
#                   Pipeline([('Scaler', StandardScaler()),
#                             ('KNN', KNeighborsClassifier())])))

pipelines.append(('RobustScaledGBM', 
                  Pipeline([('RobustScaler', RobustScaler()),
                            ('GBM', GradientBoostingClassifier())])))

results = []
names = []
for name, model in pipelines:
    kfold = KFold(n_splits=num_folds, random_state=seed)
    cv_results = cross_val_score(model, features_train, labels_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

AB: 0.429020 (0.006031)
LDA: 0.489020 (0.005764)
XGB: 0.498431 (0.009971)
ScaledLR: 0.485098 (0.010977)
ScaledLDA: 0.489020 (0.005764)
RobustScaledGBM: 0.489412 (0.008070)


In [31]:
# num_folds = 5
# seed = 7
# # scoring = 'accuracy'
# scoring = 'f1_macro'
# # Standardize the dataset
# pipelines = []
# pipelines.append(('AB', AdaBoostClassifier()))
# # pipelines.append(('ET', ExtraTreesClassifier()))
# pipelines.append(('LDA', LinearDiscriminantAnalysis()))
# pipelines.append(('XGB', XGBClassifier()))
# pipelines.append(('ScaledLR', 
#                   Pipeline([('Scaler', StandardScaler()),
#                             ('LR',LogisticRegression())])))
# pipelines.append(('ScaledLDA', 
#                   Pipeline([('Scaler', StandardScaler()),
#                             ('LDA', LinearDiscriminantAnalysis())])))
# # pipelines.append(('ScaledKNN', 
# #                   Pipeline([('Scaler', StandardScaler()),
# #                             ('KNN', KNeighborsClassifier())])))

# pipelines.append(('RobustScaledGBM', 
#                   Pipeline([('RobustScaler', RobustScaler()),
#                             ('GBM', GradientBoostingClassifier())])))

# results = []
# names = []
# for name, model in pipelines:
#     kfold = KFold(n_splits=num_folds, random_state=seed)
#     cv_results = cross_val_score(model, features_train, labels_train, cv=kfold, scoring=scoring)
#     results.append(cv_results)
#     names.append(name)
#     msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
#     print(msg)

In [32]:
# num_folds = 5
# seed = 7
# # scoring = 'accuracy'
# scoring = 'precision'
# # Standardize the dataset
# pipelines = []
# pipelines.append(('AB', AdaBoostClassifier()))
# # pipelines.append(('ET', ExtraTreesClassifier()))
# pipelines.append(('LDA', LinearDiscriminantAnalysis()))
# pipelines.append(('XGB', XGBClassifier()))
# pipelines.append(('ScaledLR', 
#                   Pipeline([('Scaler', StandardScaler()),
#                             ('LR',LogisticRegression())])))
# pipelines.append(('ScaledLDA', 
#                   Pipeline([('Scaler', StandardScaler()),
#                             ('LDA', LinearDiscriminantAnalysis())])))
# # pipelines.append(('ScaledKNN', 
# #                   Pipeline([('Scaler', StandardScaler()),
# #                             ('KNN', KNeighborsClassifier())])))

# pipelines.append(('RobustScaledGBM', 
#                   Pipeline([('RobustScaler', RobustScaler()),
#                             ('GBM', GradientBoostingClassifier())])))

# results = []
# names = []
# for name, model in pipelines:
#     kfold = KFold(n_splits=num_folds, random_state=seed)
#     cv_results = cross_val_score(model, features_train, labels_train, cv=kfold, scoring=scoring)
#     results.append(cv_results)
#     names.append(name)
#     msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
#     print(msg)

In [33]:
# num_folds = 5
# seed = 7
# # scoring = 'accuracy'
# scoring = 'recall'
# # Standardize the dataset
# pipelines = []
# pipelines.append(('AB', AdaBoostClassifier()))
# # pipelines.append(('ET', ExtraTreesClassifier()))
# pipelines.append(('LDA', LinearDiscriminantAnalysis()))
# pipelines.append(('XGB', XGBClassifier()))
# pipelines.append(('ScaledLR', 
#                   Pipeline([('Scaler', StandardScaler()),
#                             ('LR',LogisticRegression())])))
# pipelines.append(('ScaledLDA', 
#                   Pipeline([('Scaler', StandardScaler()),
#                             ('LDA', LinearDiscriminantAnalysis())])))
# # pipelines.append(('ScaledKNN', 
# #                   Pipeline([('Scaler', StandardScaler()),
# #                             ('KNN', KNeighborsClassifier())])))

# pipelines.append(('RobustScaledGBM', 
#                   Pipeline([('RobustScaler', RobustScaler()),
#                             ('GBM', GradientBoostingClassifier())])))

# results = []
# names = []
# for name, model in pipelines:
#     kfold = KFold(n_splits=num_folds, random_state=seed)
#     cv_results = cross_val_score(model, features_train, labels_train, cv=kfold, scoring=scoring)
#     results.append(cv_results)
#     names.append(name)
#     msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
#     print(msg)

# Notes:
Be sure to add pub name to stop words

In [34]:
print('doddo')

doddo
