In [16]:

import nltk
import re
import string
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import joblib

In [17]:


# IF WORKING ON GOOGLE COLLAB

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Download the file from Google Drive (replace with your file ID)
!gdown --id 1q9EBzdsMiOWUngGHcNEU-LBNj7xrg8JH -O train_data.csv # Replace YOUR_FILE_ID with your actual file ID

import os
print(f"Current working directory: {os.getcwd()}")

# Check if the file was downloaded
if os.path.exists('train_data.csv'):
    print("File 'train_data.csv' found. Attempting to read with pandas.")
    data = pd.read_csv("train_data.csv", encoding='latin-1')
    print(data.shape)
    print(data.head())
else:
    print("Error: File 'train_data.csv' not found after gdown attempt.")
    print("Please double-check the file ID and ensure the file exists and is shared correctly in Google Drive.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Downloading...
From: https://drive.google.com/uc?id=1q9EBzdsMiOWUngGHcNEU-LBNj7xrg8JH
To: /content/train_data.csv
100% 11.1M/11.1M [00:00<00:00, 73.9MB/s]
Current working directory: /content
File 'train_data.csv' found. Attempting to read with pandas.
(4489, 4)
                                               title  \
0  Syria toxic gas inquiry to end after Russia ag...   
1  Greeks march to mark 1973 student revolt again...   
2  China says will work with North Korea to boost...   
3  Argentina intensifies search for missing subma...   
4  MUST READ: Iâm Still Trying To Figure Out Ho...   

                                                text    subject        date  
0  UNITED NATIONS (Reuters) - An international in...  worldnews  2017-11-17  
1  ATHENS (Reuters) - Greek police clashed with h...  worldnews  2017-11-17  
2  BEIJING/SEOUL (Reuters) - Tradition

In [18]:
## Read Data

# `data` is already in memory at this point. The alternative kept here for
# reference is to read it from a local CSV instead:
# data = pd.read_csv("test_data_no_labels.csv",encoding='latin-1')

# Show the (rows, columns) tuple followed by the first rows, one per line.
print(data.shape, data.head(), sep="\n")

(4489, 4)
                                               title  \
0  Syria toxic gas inquiry to end after Russia ag...   
1  Greeks march to mark 1973 student revolt again...   
2  China says will work with North Korea to boost...   
3  Argentina intensifies search for missing subma...   
4  MUST READ: Iâm Still Trying To Figure Out Ho...   

                                                text    subject        date  
0  UNITED NATIONS (Reuters) - An international in...  worldnews  2017-11-17  
1  ATHENS (Reuters) - Greek police clashed with h...  worldnews  2017-11-17  
2  BEIJING/SEOUL (Reuters) - Traditional friendsh...  worldnews  2017-11-17  
3  BUENOS AIRES (Reuters) - An Argentine submarin...  worldnews  2017-11-17  
4  Once you ve read this list, you re going to wa...  left-news  2017-11-17  


In [19]:
# Snapshot the complete frame (all columns) before any columns are dropped,
# so the full rows remain available under a separate name.
original_data = data.copy(deep=True)

# Preview the snapshot to confirm it holds every column.
display(original_data.head(n=5))

Unnamed: 0,title,text,subject,date
0,Syria toxic gas inquiry to end after Russia ag...,UNITED NATIONS (Reuters) - An international in...,worldnews,2017-11-17
1,Greeks march to mark 1973 student revolt again...,ATHENS (Reuters) - Greek police clashed with h...,worldnews,2017-11-17
2,China says will work with North Korea to boost...,BEIJING/SEOUL (Reuters) - Traditional friendsh...,worldnews,2017-11-17
3,Argentina intensifies search for missing subma...,BUENOS AIRES (Reuters) - An Argentine submarin...,worldnews,2017-11-17
4,MUST READ: Iâm Still Trying To Figure Out Ho...,"Once you ve read this list, you re going to wa...",left-news,2017-11-17


# Preprocessing

In [20]:
# Keep only the 'title' column; the result is still a (one-column) DataFrame
# and every other column is discarded from `data`.
data = data.loc[:, ['title']]

# Preview the reduced frame to verify the change.
display(data.head(n=5))

Unnamed: 0,title
0,Syria toxic gas inquiry to end after Russia ag...
1,Greeks march to mark 1973 student revolt again...
2,China says will work with North Korea to boost...
3,Argentina intensifies search for missing subma...
4,MUST READ: Iâm Still Trying To Figure Out Ho...


In [21]:
from transformers import AutoTokenizer, AutoModel
import torch

# Choose model for embedding
# (the same identifier is used to load both the tokenizer and the encoder below)
embedding_model_name = "distilbert-base-uncased"

# Load tokenizer and model for embeddings
# Both are module-level names that get_embeddings() reads directly, so this
# cell has to run before any embedding is computed.
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
model = AutoModel.from_pretrained(embedding_model_name)

# Function to get embeddings
def get_embeddings(text):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    The embedding is the last hidden state at sequence position 0 (the
    [CLS] token for the tokenizer used in this notebook).

    Args:
        text: A single string, or a sequence of strings to embed as a batch.

    Returns:
        torch.Tensor: For a single string, a 1-D tensor of length
        ``hidden_size``. For a sequence of strings, a 2-D tensor of shape
        ``(len(text), hidden_size)`` -- including when the sequence holds
        exactly one string.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
    # Take position 0 of every sequence -> (batch, hidden_size).
    # Mean pooling alternative: torch.mean(outputs.last_hidden_state, dim=1)
    cls_embeddings = outputs.last_hidden_state[:, 0, :]
    # Drop only the batch axis, and only for a lone string. A bare .squeeze()
    # would also flatten a one-element list, making the output rank depend on
    # how many items happened to be passed.
    if isinstance(text, str):
        return cls_embeddings.squeeze(0)
    return cls_embeddings

# Optional smoke test: embed one sentence and report the size of the result.
sample_text = "This is a sample sentence for embedding."
sample_embedding = get_embeddings(text=sample_text)

print(sample_embedding.size())

torch.Size([768])


In [22]:
# Embed each title individually; every embedding is converted to a NumPy
# array and collected in a plain Python list.
title_embeddings_list = [get_embeddings(title).numpy() for title in data['title']]

# Stack the per-title vectors row-wise into a single feature matrix.
X_tr_all = np.vstack(title_embeddings_list)

print("Shape of X_tr_all:", X_tr_all.shape)

Shape of X_tr_all: (4489, 768)


# Predict

In [23]:
# Load the saved classifier. The file and variable names ("lr") indicate a
# logistic-regression model, not a random forest -- TODO confirm.
# Replace the path below with the actual path to your model file in Google Drive
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files from a trusted source.
model_path = '/content/drive/My Drive/model_lr_transformer.pkl'
clf_lr_trans = joblib.load(model_path)

# make predictions
# X_tr_all (the stacked title embeddings) must already exist from the previous cell
y_pred_lr_trans = clf_lr_trans.predict(X_tr_all)

print("Predictions:", y_pred_lr_trans)

Predictions: [1 1 1 ... 0 0 1]


In [27]:
import pandas as pd

# Attach the predictions to original_data, the copy that still holds every
# input column. y_pred_lr_trans must have exactly one entry per row.
original_data["predictions"] = y_pred_lr_trans

# Save to new file.
# The input CSV was decoded as latin-1 when it was loaded, so it is written
# back with the same codec. latin-1 maps each byte to one character, so this
# round-trips the original bytes exactly; the default (UTF-8) would re-encode
# every non-ASCII character and corrupt the text in the output file.
# Keep this in sync with the encoding used to read the input.
original_data.to_csv(
    "predictionsLR_Transformers_with_original_data.csv",
    index=False,
    encoding="latin-1",
)