Load model

In [None]:

import nltk
import re
import string
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import joblib

In [19]:
## Read Data

import pandas as pd

data = pd.read_csv("test_data_no_labels.csv",encoding='latin-1')

print(data.shape)

print(data.head())

(4489, 4)
                                               title  \
0  Syria toxic gas inquiry to end after Russia ag...   
1  Greeks march to mark 1973 student revolt again...   
2  China says will work with North Korea to boost...   
3  Argentina intensifies search for missing subma...   
4  MUST READ: Iâm Still Trying To Figure Out Ho...   

                                                text    subject        date  
0  UNITED NATIONS (Reuters) - An international in...  worldnews  2017-11-17  
1  ATHENS (Reuters) - Greek police clashed with h...  worldnews  2017-11-17  
2  BEIJING/SEOUL (Reuters) - Traditional friendsh...  worldnews  2017-11-17  
3  BUENOS AIRES (Reuters) - An Argentine submarin...  worldnews  2017-11-17  
4  Once you ve read this list, you re going to wa...  left-news  2017-11-17  


# Preprocessing

In [38]:
from nltk.tokenize import word_tokenize

# Tokenize 'title' and 'text' columns and store as new columns
data['title'] = data['title'].apply(lambda x: word_tokenize(str(x)))
data['text'] = data['text'].apply(lambda x: word_tokenize(str(x)))

def clean_text(text: str) -> str:
    # Ensure we are working with a string
    text = str(text)

    # Remove all special characters (keep only letters, numbers, and spaces)
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)

    # Remove all single characters (like "a", "b", "c" standing alone)
    text = re.sub(r"\b[A-Za-z]\b", "", text)

    # Remove single characters from the start of the text
    text = re.sub(r"^[A-Za-z]\s+", "", text)

    # Replace multiple spaces with a single space
    text = re.sub(r"\s+", " ", text)

    # Convert everything to lowercase
    text = text.lower()

    return text


# clean text
data['text'] = data['text'].apply(clean_text)

# clean title
data['title'] = data['title'].apply(clean_text)

data.head() 



Unnamed: 0,title,text,subject,date
0,syria toxic gas inquiry to end after russia ag...,united nations reuters an international invest...,worldnews,2017-11-17
1,greeks march to mark 1973 student revolt again...,athens reuters greek police clashed with hoode...,worldnews,2017-11-17
2,china says will work with north korea to boost...,beijingseoul reuters traditional friendship be...,worldnews,2017-11-17
3,argentina intensifies search for missing subma...,buenos aires reuters an argentine submarine wi...,worldnews,2017-11-17
4,must read ix80x99m still trying to figure out ...,once you ve read this list you re going to wan...,left-news,2017-11-17


In [39]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = word_tokenize(str(text))
    filtered = [word for word in tokens if word.lower() not in stop_words and word.isalpha()]
    return ' '.join(filtered)

# Make a copy to preserve the original data
data_nostop = data.copy()

# Replace the columns with stopword-removed text
data_nostop['title'] = data_nostop['title'].apply(remove_stopwords)
data_nostop['text'] = data_nostop['text'].apply(remove_stopwords)

data_nostop.head()


Unnamed: 0,title,text,subject,date
0,syria toxic gas inquiry end russia blocks un r...,united nations reuters international investiga...,worldnews,2017-11-17
1,greeks march mark student revolt junta clashes...,athens reuters greek police clashed hooded you...,worldnews,2017-11-17
2,china says work north korea boost ties envoy v...,beijingseoul reuters traditional friendship ch...,worldnews,2017-11-17
3,argentina intensifies search missing submarine...,buenos aires reuters argentine submarine crew ...,worldnews,2017-11-17
4,must read still trying figure hillary lost,read list going want share everyone know russi...,left-news,2017-11-17


In [40]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_text(text):
    tokens = word_tokenize(str(text))
    return ' '.join([stemmer.stem(word) for word in tokens])

def lemmatize_text(text):
    tokens = word_tokenize(str(text))
    return ' '.join([lemmatizer.lemmatize(word) for word in tokens])

# stem the data
data_norm = data_nostop.copy()

data_norm['title'] = data_norm['title'].apply(stem_text)
data_norm['text'] = data_norm['text'].apply(stem_text)

data_norm.head()

# lemmatize the data
data_norm['title'] = data_norm['title'].apply(lemmatize_text)
data_norm['text'] = data_norm['text'].apply(lemmatize_text)

data_norm.head()   


Unnamed: 0,title,text,subject,date
0,syria toxic ga inquiri end russia block un renew,unit nation reuter intern investig blame chemi...,worldnews,2017-11-17
1,greek march mark student revolt junta clash break,athen reuter greek polic clash hood youth athe...,worldnews,2017-11-17
2,china say work north korea boost tie envoy visit,beijingseoul reuter tradit friendship china no...,worldnews,2017-11-17
3,argentina intensifi search miss submarin crew,bueno air reuter argentin submarin crew board ...,worldnews,2017-11-17
4,must read still tri figur hillari lost,read list go want share everyon know russian u...,left-news,2017-11-17


In [52]:

# prepare new data
new_df = data.copy()
new_df["date"] = pd.to_datetime(new_df["date"])
new_df = new_df.sort_values("date").reset_index(drop=True)

# drop raw date
X_new = new_df.drop(columns=["date"]) if "date" in new_df.columns else new_df

X_new.head()



Unnamed: 0,title,text,subject
0,syria toxic gas inquiry to end after russia ag...,united nations reuters an international invest...,worldnews
1,lebanon hariri on twitter am on the way to the...,cairo reuters lebanon prime minister saad alha...,worldnews
2,factbox pot nation canada plans for legal mari...,reuters recreational marijuana is on track to ...,worldnews
3,lebanese sunni politician warns of arab sancti...,beirut reuters lebanon could face economic san...,worldnews
4,russia blocks bid to briefly extend syria chem...,united nations reuters russia vetoed on friday...,worldnews


## tf_idf

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

v_text = joblib.load("vectorizer_text.pkl")
v_title = joblib.load("vectorizer_title.pkl")

X_tr_text  = v_text.transform(X_new["text"].astype(str))
X_tr_title = v_title.transform(X_new["title"].astype(str))
X_tr_all   = hstack([X_tr_text, X_tr_title])

X_tr_all.shape

(4489, 152612)

# Predict

### Logistic Regression

In [68]:

# load LR model
clf_lr = joblib.load("modelLR.pkl")

# make predictions
y_pred_lr = clf_lr.predict(X_tr_all)

print("Predictions:", y_pred_lr)


Predictions: [1 0 1 ... 0 0 0]


In [76]:
import pandas as pd

# Load your old data
data = pd.read_csv("test_data_no_labels.csv", encoding='latin-1')

# Add predictions as a new column
data["predictions"] = y_pred_lr  # y_pred_lr must match number of rows

# Save to new file
data.to_csv("predictionsLR.csv", index=False)


## Random Forest

In [77]:

# load RF model
clf_rf = joblib.load("modelRF.pkl")

# make predictions
y_pred_rf = clf_rf.predict(X_tr_all)

print("Predictions:", y_pred_rf)


Predictions: [1 1 1 ... 0 0 0]


In [78]:
import pandas as pd

# Load your old data
data = pd.read_csv("test_data_no_labels.csv", encoding='latin-1')

# Add predictions as a new column
data["predictions"] = y_pred_rf  # y_pred_rf must match number of rows

# Save to new file
data.to_csv("predictionsRF.csv", index=False)
