In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import gensim.downloader as api

In [2]:
wv = api.load("word2vec-google-news-300")

In [3]:
df = pd.read_csv('data.csv', encoding='unicode_escape')
df.shape

(2225, 3)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   category    2225 non-null   object 
 1   text        2225 non-null   object 
 2   Unnamed: 2  0 non-null      float64
dtypes: float64(1), object(2)
memory usage: 52.3+ KB


In [5]:
df.category.value_counts()

category
business         511
sport            510
politics         417
tech             400
entertainment    387
Name: count, dtype: int64

In [6]:
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['category'])
df.head()

Unnamed: 0,category,text,Unnamed: 2,label
0,tech,tv future in the hands of viewers with home th...,,4
1,business,worldcom boss left books alone former worldc...,,0
2,sport,tigers wary of farrell gamble leicester say ...,,3
3,sport,yeading face newcastle in fa cup premiership s...,,3
4,entertainment,ocean s twelve raids box office ocean s twelve...,,1


In [7]:
df.drop('Unnamed: 2', axis=1, inplace=True)
df.head()

Unnamed: 0,category,text,label
0,tech,tv future in the hands of viewers with home th...,4
1,business,worldcom boss left books alone former worldc...,0
2,sport,tigers wary of farrell gamble leicester say ...,3
3,sport,yeading face newcastle in fa cup premiership s...,3
4,entertainment,ocean s twelve raids box office ocean s twelve...,1


In [8]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     -------------------------------------- 0.0/587.7 MB 165.2 kB/s eta 0:59:19
     -------------------------------------- 0.0/587.7 MB 245.8 kB/s eta 0:39:52
     -------------------------------------- 0.1/587.7 MB 357.2 kB/s eta 0:27:26
     -------------------------------------- 0.1/587.7 MB 516.7 kB/s eta 0:18:58
     -------------------------------------- 0.2/587.7 MB 734.2 kB/s eta 0:13:21
     -------------------------------------- 0.3/587.7 MB 811.7 kB/s eta 0:12:04
     -------------------------------------- 0.3/587.7 MB 873.8 kB/s eta 0:11:13
     -------------------------------------- 0.4/587.7 MB 919.0 kB/s eta 0:10:40
     --------------------------------

In [9]:
# Creating a function to preprocess the text column

import spacy
nlp = spacy.load("en_core_web_lg")


def preprocess(text):
    doc = nlp(text)
    
    filtered_tokens = []
    for word in doc:
        if word.is_punct or word.is_stop:
            continue
        filtered_tokens.append(word.lemma_)
    return wv.get_mean_vector(filtered_tokens)

In [10]:
df['clean_text'] = df['text'].apply(lambda text: preprocess(text))
df.head()

Unnamed: 0,category,text,label,clean_text
0,tech,tv future in the hands of viewers with home th...,4,"[0.0029764557, 0.011629638, 0.00090882956, 0.0..."
1,business,worldcom boss left books alone former worldc...,0,"[-0.009847597, 0.0068147634, -0.0046906685, 0...."
2,sport,tigers wary of farrell gamble leicester say ...,3,"[0.00036933756, 0.014444627, 0.0017897796, 0.0..."
3,sport,yeading face newcastle in fa cup premiership s...,3,"[-0.008516042, 0.008820114, 0.026090756, 0.022..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,1,"[-0.005286897, 0.012089302, -0.006101363, 0.03..."


In [37]:
df['clean_text'].shape

(2225,)

In [38]:
X = df['clean_text']
y = df['label']

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
X_train[0]

array([ 0.00297646,  0.01162964,  0.00090883,  0.03457899, -0.01670234,
        0.00366297,  0.02506217, -0.03020656,  0.03771606,  0.01510863,
       -0.02718888, -0.03845195, -0.01098833, -0.0087716 , -0.03560304,
        0.03596859,  0.02222952,  0.03662806,  0.00805568, -0.0320592 ,
       -0.00226619,  0.01480762,  0.00958186,  0.02251823,  0.00636224,
        0.0041969 , -0.02619888,  0.04043696,  0.01810201, -0.0153979 ,
       -0.01037938,  0.00210234, -0.02929954, -0.01200871, -0.00091561,
       -0.02022207,  0.00284284,  0.00197389,  0.01917141,  0.01227514,
        0.01428427, -0.00846291,  0.04847049,  0.01288042, -0.01905905,
       -0.026159  , -0.00708836,  0.00792605, -0.0078293 ,  0.00795486,
       -0.00532901,  0.00085274,  0.00626599, -0.00909321,  0.00697808,
       -0.0006141 , -0.02940313, -0.01975465,  0.01694717, -0.01704587,
       -0.00584803,  0.00680073, -0.05116668, -0.002624  , -0.00838106,
        0.00468009, -0.03115963,  0.03822583,  0.00276982,  0.02

In [42]:
X_train = np.stack(X_train)
X_test = np.stack(X_test)

In [43]:
X_train[0]

array([ 0.00193428,  0.02061828, -0.00980333,  0.02122835,  0.00313915,
        0.00432369,  0.00158204, -0.03766968,  0.0306087 ,  0.02435788,
       -0.00310426, -0.03952055, -0.01238149,  0.00942886, -0.04347974,
        0.04493241,  0.014057  ,  0.02670501,  0.00011738, -0.02029575,
        0.00314207,  0.02752284,  0.01073541, -0.00421053,  0.00040784,
       -0.01043998, -0.02150945,  0.03342304,  0.01506129, -0.00202068,
       -0.02249594, -0.00601889, -0.02474676, -0.00561612,  0.00018046,
       -0.00742169,  0.02313323,  0.02980158,  0.01829236,  0.03066635,
        0.02148492, -0.03499369,  0.04552441,  0.02955964, -0.00186307,
       -0.0190217 , -0.01597471,  0.00228312,  0.02393671,  0.01323395,
       -0.01838345,  0.00691611, -0.00579188, -0.00076139, -0.00192454,
       -0.02695186, -0.01644375, -0.02200504, -0.01278811, -0.01935249,
       -0.00933981,  0.03418165, -0.03129874, -0.00389385, -0.00193464,
        0.01782906, -0.01460885,  0.02149636,  0.00304391,  0.03

In [44]:
model = GradientBoostingClassifier()
model.fit(X_train, y_train)

In [45]:
y_pred = model.predict(X_test)

In [46]:
print(accuracy_score(y_test, y_pred))

0.9325842696629213


In [47]:
def outcome(text):
    preprocessed_text = preprocess(text)
    stacked_text = np.stack([preprocessed_text])
    result = model.predict(stacked_text)
    if result == 0:
        print("Business")
    elif result == 1:
        print("Entertainment")
    elif result == 2:
        print("Politics")
    elif result == 3:
        print("Sport")
    else:
        print("Tech")

In [50]:
outcome("Worried about war and its impact on Iran's already flailing economy, a significant proportion of Iranians oppose what they see as the reckless adventurism of the country's elite Islamic Revolutionary Guards Corps (IRGC), which fired more than 300 drones and missiles on Saturday night.")

Politics


In [49]:
import pickle
pickle.dump(model, open('model.pkl', 'wb'))