## Preprocessing with scikit learn

In [1]:
import os
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Loading file from path
def loading_file():
    file_dir = '/home/nbuser/library/1. Classifier/3. Exploratory Data Analysis'        
    file_list = glob.glob(file_dir + '/*.csv')
    csv_file = file_list[1]
    return csv_file

# Import file imto Pandas DataFrame
def importing_file(csv_file):
    df = pd.read_csv(csv_file, sep=",")
    return df

# Saving path
def saving_file(file, file_name, save_dir):
    file.to_csv(os.path.join(save_dir,file_name))


In [3]:
# Importing file + Loading  file
news_df = importing_file(loading_file())

# Top 5 records
news_df.head()

Unnamed: 0,file_name,title,news_text,category
0,164.txt,Parker's saxophone heads auction,A saxophone belonging to legendary jazz musici...,entertainment
1,012.txt,Edwards tips Idowu for Euro gold,World outdoor triple jump record holder and BB...,sports
2,257.txt,Blair and Brown criticised by MPs,Labour MPs have angrily criticised Tony Blair ...,politics
3,238.txt,Firms pump billions into pensions,Employers have spent billions of pounds proppi...,business
4,400.txt,Monsanto fined $1.5m for bribery,The US agrochemical giant Monsanto has agreed ...,business


In [4]:
news_df.shape

(2003, 4)

In [5]:
news_df.category.unique()

array(['entertainment', 'sports', 'politics', 'business', 'tech'],
      dtype=object)

In [59]:
news_df.sample(5)

Unnamed: 0,file_name,title,news_text,category
1602,258.txt,S Korea spending boost to economy,South Korea will boost state spending next yea...,business
833,256.txt,Singapore growth at 8.1% in 2004,Singapores economy grew by 81 in 2004 its best...,business
1776,413.txt,EC calls truce in deficit battle,The European Commission EC has called a truce ...,business
1528,176.txt,Councils 'must find Gypsy sites',Ministers are telling councils to find more si...,politics
1703,402.txt,Brown's poll campaign move denied,The government has denied reports that Gordon ...,politics


In [24]:
news_df.category.value_counts()

business         466
sports           457
politics         368
tech             364
entertainment    348
Name: category, dtype: int64

### Clean the news_text column

In [94]:
import re
def clean_text(x):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', str(x))
    return text

def clean_all_text(df):
    for index, item in df.iterrows():
        cleantext = clean_text(item['news_text'])
        item['news_text'] = cleantext
    return df

In [95]:
news_df = clean_all_text(news_df)

### Split the dataset

In [9]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Create a series to store the labels: y
y = news_df["category"]

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(news_df["news_text"], y, test_size=0.3, random_state=53)


### CountVectorizer for text classification

In [10]:
# Import the necessary modules
from sklearn.feature_extraction.text import CountVectorizer

# Initialize a CountVectorizer object: count_vectorizer
count_vectorizer = CountVectorizer(stop_words="english")

# Transform the training data using only the 'text' column values: count_train 
count_train = count_vectorizer.fit_transform(X_train)

# Transform the test data using only the 'text' column values: count_test 
count_test = count_vectorizer.transform(X_test)

            
# Print the first 10 features of the count_vectorizer
print(count_vectorizer.get_feature_names()[:100])


['00', '000', '00051', '001', '003', '004secs', '007', '01', '0100', '0130', '019secs', '02', '0200', '022', '0227', '024', '025', '027', '028', '03', '030', '0300', '0305', '04', '040', '041', '048', '05', '053', '053bn', '05overall', '06', '0605', '0605festival', '0619', '07', '0700', '0710', '08', '0800', '083mph', '085', '0870', '089', '09', '090', '0900', '098', '099', '10', '100', '1000', '10000', '100000', '10000m', '10000vote', '1000bn', '1000m', '1000s', '1000th', '1001st', '1008', '100bn', '100m', '100mthey', '100s', '101', '101115', '10137', '1015', '1019', '101yearold', '102', '1020', '10216bn', '1022', '10227', '1025', '1026bn', '10276', '10280', '1028bn', '102inch', '102m', '103', '1030', '10360', '1038548recent', '1038am', '103bn', '103m', '104', '1040', '104000', '10416', '1044', '10499', '104bn', '104m', '105']


#### TfidfVectorizer for text classification

In [11]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english",max_df=0.7)

# Transform the training data: tfidf_train 
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data: tfidf_test 
tfidf_test = tfidf_vectorizer.transform(X_test)
    
print(tfidf_vectorizer.get_feature_names()[:100])

['00', '000', '00051', '001', '003', '004secs', '007', '01', '0100', '0130', '019secs', '02', '0200', '022', '0227', '024', '025', '027', '028', '03', '030', '0300', '0305', '04', '040', '041', '048', '05', '053', '053bn', '05overall', '06', '0605', '0605festival', '0619', '07', '0700', '0710', '08', '0800', '083mph', '085', '0870', '089', '09', '090', '0900', '098', '099', '10', '100', '1000', '10000', '100000', '10000m', '10000vote', '1000bn', '1000m', '1000s', '1000th', '1001st', '1008', '100bn', '100m', '100mthey', '100s', '101', '101115', '10137', '1015', '1019', '101yearold', '102', '1020', '10216bn', '1022', '10227', '1025', '1026bn', '10276', '10280', '1028bn', '102inch', '102m', '103', '1030', '10360', '1038548recent', '1038am', '103bn', '103m', '104', '1040', '104000', '10416', '1044', '10499', '104bn', '104m', '105']


#### Show Vectors as Features

In [12]:
# Create the CountVectorizer DataFrame: count_df
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())

# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())

# Print the head of count_df
print(count_df.head())
print()
# Print the head of tfidf_df
print(tfidf_df.head())

# Calculate the difference in columns: difference
difference = set(count_df.columns) - set(tfidf_df.columns)
print(difference)

# Check whether the DataFrames are equal
print(count_df.equals(tfidf_df))


   00  000  00051  001  003  004secs  007  01  0100  0130     ...       zones  \
0   0    0      0    0    0        0    0   0     0     0     ...           0   
1   0    0      0    0    0        0    0   0     0     0     ...           0   
2   0    0      0    0    0        0    0   0     0     0     ...           0   
3   0    0      0    0    0        0    0   0     0     0     ...           0   
4   0    0      0    0    0        0    0   0     0     0     ...           0   

   zoom  zoomsmore  zorro  zubair  zuluaga  zurich  zutons  zvonareva  \
0     0          0      0       0        0       0       0          0   
1     0          0      0       0        0       0       0          0   
2     0          0      0       0        0       0       0          0   
3     0          0      0       0        0       0       0          0   
4     0          0      0       0        0       0       0          0   

   zvyagintsev  
0            0  
1            0  
2            0  
3     

#### Training and testing models with CountVectorizer

In [13]:
# Import the necessary modules
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=['entertainment', 'politics', 'business', 'sports', 'tech'])
print(cm)


0.9717138103161398
[[102   2   0   0   1]
 [  0 101   0   0   0]
 [  0   8 134   0   4]
 [  0   0   0 143   0]
 [  0   1   0   1 104]]


#### Training and testing the model with TfidfVectorizer

In [14]:
# Create a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(tfidf_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(tfidf_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=['entertainment', 'politics', 'business', 'sports', 'tech'])
print(cm)


0.9683860232945092
[[100   3   1   0   1]
 [  0 101   0   0   0]
 [  0   5 139   0   2]
 [  0   0   0 143   0]
 [  0   2   2   3  99]]


#### Improving your model


In [15]:
# Create the list of alphas: alphas
alphas = np.arange(0, 1, .1)

# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(tfidf_train, y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(tfidf_test)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test, pred)
    return score

# Iterate over the alphas and print the corresponding score
best_alfa_score = {}
for alpha in alphas:
    best_alfa_score[alpha] = train_and_predict(alpha)
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()
best_score = max(best_alfa_score, key=best_alfa_score.get)

print("Best alfa: ", best_score, "best accuracy: ",best_alfa_score[best_score])

  'setting alpha = %.1e' % _ALPHA_MIN)


Alpha:  0.0
Score:  0.9683860232945092

Alpha:  0.1
Score:  0.9750415973377704

Alpha:  0.2
Score:  0.9750415973377704

Alpha:  0.30000000000000004
Score:  0.9750415973377704

Alpha:  0.4
Score:  0.9717138103161398

Alpha:  0.5
Score:  0.9733777038269551

Alpha:  0.6000000000000001
Score:  0.9700499168053245

Alpha:  0.7000000000000001
Score:  0.9700499168053245

Alpha:  0.8
Score:  0.9683860232945092

Alpha:  0.9
Score:  0.9683860232945092

Best alfa:  0.1 best accuracy:  0.9750415973377704


In [16]:
# Create the list of alphas: alphas
alphas = np.arange(0, 1, .1)

# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(count_train, y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(count_test)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test, pred)
    return score

# Iterate over the alphas and print the corresponding score
best_alfa_score = {}
for alpha in alphas:
    best_alfa_score[alpha] = train_and_predict(alpha)
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()
best_score = max(best_alfa_score, key=best_alfa_score.get)

print("Best alfa: ", best_score, "best accuracy: ",best_alfa_score[best_score])

  'setting alpha = %.1e' % _ALPHA_MIN)


Alpha:  0.0
Score:  0.9717138103161398

Alpha:  0.1
Score:  0.9816971713810316

Alpha:  0.2
Score:  0.9800332778702163

Alpha:  0.30000000000000004
Score:  0.978369384359401

Alpha:  0.4
Score:  0.978369384359401

Alpha:  0.5
Score:  0.978369384359401

Alpha:  0.6000000000000001
Score:  0.9733777038269551

Alpha:  0.7000000000000001
Score:  0.9717138103161398

Alpha:  0.8
Score:  0.9717138103161398

Alpha:  0.9
Score:  0.9717138103161398

Best alfa:  0.1 best accuracy:  0.9816971713810316


#### Inspecting the model


In [18]:
# Get the class labels: class_labels
class_labels = nb_classifier.classes_

# Extract the features: feature_names
feature_names = tfidf_vectorizer.get_feature_names()

# Zip the feature names together with the coefficient array and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(nb_classifier.coef_[0], feature_names))

# Print the first class label and the top 20 feat_with_weights entries
print(class_labels[0], feat_with_weights[:20])

# Print the second class label and the bottom 20 feat_with_weights entries
print(class_labels[1], feat_with_weights[-20:])


business [(-10.45661585751269, '00'), (-10.45661585751269, '000'), (-10.45661585751269, '0001'), (-10.45661585751269, '00051'), (-10.45661585751269, '002'), (-10.45661585751269, '004secs'), (-10.45661585751269, '007'), (-10.45661585751269, '0100'), (-10.45661585751269, '0130'), (-10.45661585751269, '019secs'), (-10.45661585751269, '0227'), (-10.45661585751269, '028'), (-10.45661585751269, '0305'), (-10.45661585751269, '040'), (-10.45661585751269, '0469'), (-10.45661585751269, '053bn'), (-10.45661585751269, '0800'), (-10.45661585751269, '0870'), (-10.45661585751269, '090'), (-10.45661585751269, '1000000000')]
entertainment [(-8.613260493776794, 'stock'), (-8.60427307626571, 'new'), (-8.577506079896372, 'firms'), (-8.57157225931154, 'rise'), (-8.566396008929635, 'government'), (-8.546549679980423, 'yukos'), (-8.370228484903935, '2004'), (-8.365628689869363, 'prices'), (-8.352785134212938, 'firm'), (-8.311158085240452, 'economic'), (-8.297926644109499, 'shares'), (-8.282300993198296, 'mr'

### Test on new dataset

In [57]:
# Import new data
os.chdir('/home/nbuser/library/1. Classifier/3. Exploratory Data Analysis')
new_data = pd.read_csv('news_df_validation.csv')
new_data.head()

Unnamed: 0,file_name,title,news_text,category
0,214.txt,Mansfield 0-1 Leyton Orient,An second-half goal from Andy Scott condemned ...,sports
1,303.txt,Film production 'falls' 40% in UK,The number of British films produced in the UK...,entertainment
2,083.txt,Hague 'given up' his PM ambition,Former Conservative leader William Hague says ...,politics
3,190.txt,SA return to Mauritius,Top seeds South Africa return to the scene of ...,sports
4,103.txt,Minimum rate for foster parents,Foster carers are to be guaranteed a minimum a...,politics


#### Count Vectorizer on test dataset

In [65]:
def new_data_count_vec_predict(test_new_data):
    clean_df = clean_all_text(test_new_data)
    nb_classifier = MultinomialNB()
    nb_classifier.fit(count_train, y_train)
    count_test = count_vectorizer.transform(test_new_data['news_text'])
    new_cv_pred = nb_classifier.predict(count_test)
    return new_cv_pred

In [66]:
def print_new_data_pred(test_data, index):
    new_data_cat = new_data['category']
    print(new_data_count_vec_predict(test_data)[index],' == ', new_data_cat[index])

In [67]:
print_new_data_pred(new_data, 28)

entertainment  ==  entertainment


In [69]:
count_vec_new_pred = new_data_count_vec_predict(new_data)

#### TFIDF on new dataset test

In [36]:
def new_data_ftidf_vec_predict(test_new_data):
    clean_df = clean_all_text(test_new_data)
    nb_classifier = MultinomialNB()
    nb_classifier.fit(tfidf_train, y_train)
    ftidf_test = tfidf_vectorizer.transform(test_new_data['news_text'])
    new_tfidf_pred = nb_classifier.predict(ftidf_test)
    return new_tfidf_pred

In [45]:
tfidf_new_pred = new_data_ftidf_vec_predict(new_data)

In [44]:
print(tfidf_new_pred[112])

sports


### End the scooooore iiiiiiiis: 

In [55]:
count_vec_score = metrics.accuracy_score(count_vec_new_pred, new_data['category'])
print('Count vectorize score: ',round(count_vec_score, 4))
print()
tfidf_vec_score = metrics.accuracy_score(new_data_ftidf_vec_predict(new_data), new_data['category'])
print('TF-IDF score: ',round(tfidf_vec_score, 4))

Count vectorize score:  0.9775

TF-IDF score:  0.973


### CV: 0.9775 <br><br>TFIDF: 0.973

### Saving models to pickle file

In [73]:
#Import pickle
import pickle

In [80]:
# save count_vector model to disk
nb_classifier = MultinomialNB()
nb_classifier.fit(count_train, y_train)

os.chdir('/home/nbuser/library/1. Classifier/4. Feature Engineering')
filename = 'count_vec_model.sav'
pickle.dump(nb_classifier, open(filename, 'wb'))
 
 
# load the model from disk
#loaded_model = pickle.load(open(filename, 'rb'))

In [81]:
# save tf-idf_vector model to disk
nb_classifier = MultinomialNB()
nb_classifier.fit(tfidf_train, y_train)

filename = 'tfidf_vec_model.sav'
pickle.dump(nb_classifier, open(filename, 'wb'))
 
 

### Test on BBC live news

In [82]:
!pip install newsapi-python

Collecting newsapi-python
Collecting requests==2.21.0 (from newsapi-python)
  Using cached https://files.pythonhosted.org/packages/7d/e3/20f3d364d6c8e5d2353c72a67778eb189176f08e873c9900e10c0287b84b/requests-2.21.0-py2.py3-none-any.whl
Collecting chardet<3.1.0,>=3.0.2 (from requests==2.21.0->newsapi-python)
  Using cached https://files.pythonhosted.org/packages/bc/a9/01ffebfb562e4274b6487b4bb1ddec7ca55ec7510b22e4c51f14098443b8/chardet-3.0.4-py2.py3-none-any.whl
Collecting certifi>=2017.4.17 (from requests==2.21.0->newsapi-python)
[?25l  Downloading https://files.pythonhosted.org/packages/18/b0/8146a4f8dd402f60744fa380bc73ca47303cccf8b9190fd16a827281eac2/certifi-2019.9.11-py2.py3-none-any.whl (154kB)
[K     |████████████████████████████████| 163kB 1.3MB/s eta 0:00:01
[?25hCollecting urllib3<1.25,>=1.21.1 (from requests==2.21.0->newsapi-python)
[?25l  Downloading https://files.pythonhosted.org/packages/01/11/525b02e4acc0c747de8b6ccdab376331597c569c42ea66ab0a1dbd36eca2/urllib3-1.24.3-p

In [83]:
from newsapi import NewsApiClient
newsapi = NewsApiClient(api_key='61f57a8c2e014016a21c6c56828d455c')

In [84]:
# /v2/everything
all_articles = newsapi.get_everything(sources='bbc-news', page_size=100)

In [91]:
BBC_all_news_df = pd.DataFrame.from_dict(all_articles['articles'])
BBC_all_news_df['news_text'] = BBC_all_news_df['content']
#BBC_all_news_df = BBC_all_news_df.drop(['author', 'description', 'content', 'publishedAt', 'source', 'title', 'url', 'urlToImage'], axis=1)
list(BBC_all_news_df.columns)

['author',
 'content',
 'description',
 'publishedAt',
 'source',
 'title',
 'url',
 'urlToImage',
 'news_text']

In [87]:
pd.set_option('display.max_colwidth', -1)

In [88]:
BBC_all_news_df

Unnamed: 0,news_text
0,"Image copyrightGoogle\r\nStaff have been threatened with a meat cleaver during a bank robbery in Fife. \r\nPolice said the incident happened just after 09:00 at the Bank of Scotland in Bothwell Street, Dunfermline. \r\nA man threatened workers with the weapon and d… [+192 chars]"
1,"Check the latest opinion poll trends with the poll tracker, which measures how people say they are going to vote at the next general election.\r\nAnalysis by BBC senior political analyst Peter Barnes\r\nAs our poll tracker graph shows, after a year and a half of … [+10394 chars]"
2,"Image copyrightBBC NewsImage caption\r\n Thomas Cook passenger Mary Nicholls (right), on holiday with her grandson Matt Walker, fears running out of her heart condition medication if she is stranded in Cyprus.\r\nThomas Cook customers have told the BBC of their f… [+3597 chars]"
3,"Jonny Bairstow has been left out of England's squad for the Tests in New Zealand, while uncapped quartet Matthew Parkinson, Dom Sibley, Zak Crawley and Saqib Mahmood have been called up.\r\nWicketkeeper Bairstow averages 20.25 in Tests this year, so Jos Buttler… [+756 chars]"
4,Portuguese coach Toni Conceicao and Francois Omam-Biyik have been given the task of leading Cameroon to the Africa Cup of Nations title in 2021\r\nCameroon's new coaching team of Portugal's Toni Conceicao and former Indomitable Lion player Francois Omam-Biyik a… [+2198 chars]
5,"Cory Hill last played in Wales' 2019 Six Nations victory over England in February\r\nWales second row Cory Hill could ""possibly"" be out of the World Cup without playing a game, says coach Warren Gatland.\r\nWales had only two fit locks - Alun Wyn Jones and Jake B… [+1434 chars]"
6,Image copyrightAFPImage caption\r\n Emiliano Sala had just signed with Cardiff City before the plane he was travelling in crashed into the English Channel on 21 January\r\nTwo people have been jailed after admitting accessing CCTV footage of the post-mortem exami… [+345 chars]
7,Image copyrightGetty ImagesImage caption\r\n A court sketch during the trial on Monday of women accused of plotting a bomb attack in Paris\r\nFive French women have gone on trial in Paris accused of trying to detonate a car bomb near the iconic Notre-Dame cathedr… [+2902 chars]
8,"British number one Kyle Edmund is looking for another coach after parting company with Mark Hilton.\r\nEdmund lost to Chilean Cristian Garin in the Chengdu Open first round on Monday, his fourth straight defeat.\r\nHowever, Hilton was not in China for the match a… [+861 chars]"
9,"The EU has approved for the first time the use of a medicinal cannabis product aimed at patients with two rare, but severe, forms of childhood epilepsy.\r\nDoctors can prescribe Epidyolex - an oral solution of cannabidiol, which comes from the cannabis plant - … [+2473 chars]"


In [96]:
count_vec_BBC_live_pred = new_data_count_vec_predict(BBC_all_news_df)

In [97]:
print(count_vec_BBC_live_pred)

['politics' 'politics' 'entertainment' 'sports' 'sports' 'sports'
 'entertainment' 'entertainment' 'sports' 'business' 'tech' 'sports'
 'entertainment' 'tech' 'politics' 'sports' 'politics' 'entertainment'
 'business' 'politics' 'sports' 'politics' 'sports' 'sports'
 'entertainment' 'sports' 'business' 'tech' 'entertainment' 'sports'
 'entertainment' 'politics' 'entertainment' 'business' 'sports' 'business'
 'sports' 'business' 'sports' 'sports' 'business' 'politics' 'sports'
 'tech' 'politics' 'sports' 'tech' 'tech' 'sports' 'sports' 'tech'
 'entertainment' 'sports' 'politics' 'sports' 'sports' 'entertainment'
 'entertainment' 'tech' 'tech' 'sports' 'tech' 'sports' 'entertainment'
 'tech' 'sports' 'sports' 'business' 'politics' 'sports' 'business'
 'politics' 'sports' 'business' 'politics' 'politics' 'sports' 'sports'
 'tech' 'sports' 'sports' 'sports' 'business' 'sports' 'politics'
 'entertainment' 'tech' 'politics' 'politics' 'sports' 'sports' 'politics'
 'tech' 'sports' 'entertainm

In [98]:
tfidf_BBC_live_pred = new_data_ftidf_vec_predict(BBC_all_news_df)

In [99]:
print(tfidf_BBC_live_pred)

['business' 'politics' 'sports' 'sports' 'sports' 'sports' 'sports'
 'politics' 'sports' 'business' 'tech' 'sports' 'sports' 'tech' 'politics'
 'sports' 'politics' 'sports' 'business' 'politics' 'sports' 'politics'
 'sports' 'sports' 'sports' 'sports' 'business' 'tech' 'politics' 'sports'
 'entertainment' 'politics' 'business' 'business' 'sports' 'business'
 'sports' 'business' 'sports' 'sports' 'business' 'politics' 'sports'
 'tech' 'business' 'sports' 'tech' 'tech' 'sports' 'sports' 'tech'
 'entertainment' 'sports' 'politics' 'sports' 'sports' 'entertainment'
 'sports' 'business' 'business' 'sports' 'tech' 'sports' 'entertainment'
 'tech' 'sports' 'sports' 'business' 'sports' 'sports' 'business' 'sports'
 'sports' 'business' 'politics' 'politics' 'sports' 'sports' 'tech'
 'sports' 'sports' 'sports' 'sports' 'sports' 'politics' 'entertainment'
 'tech' 'politics' 'politics' 'sports' 'sports' 'politics' 'tech' 'sports'
 'sports' 'business' 'sports' 'entertainment' 'politics' 'tech']
