In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('US-Economic-News.csv', delimiter=',', encoding= 'ISO-8859-1')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               8000 non-null   int64  
 1   _golden                8000 non-null   bool   
 2   _unit_state            8000 non-null   object 
 3   _trusted_judgments     8000 non-null   int64  
 4   _last_judgment_at      8000 non-null   object 
 5   positivity             1420 non-null   float64
 6   positivity:confidence  3775 non-null   float64
 7   relevance              8000 non-null   object 
 8   relevance:confidence   8000 non-null   float64
 9   articleid              8000 non-null   object 
 10  date                   8000 non-null   object 
 11  headline               8000 non-null   object 
 12  positivity_gold        0 non-null      float64
 13  relevance_gold         0 non-null      float64
 14  text                   8000 non-null   object 
dtypes: b

In [3]:
df.head(5)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,positivity,positivity:confidence,relevance,relevance:confidence,articleid,date,headline,positivity_gold,relevance_gold,text
0,842613455,False,finalized,3,12/5/15 17:48,3.0,0.64,yes,0.64,wsj_398217788,8/14/91,Yields on CDs Fell in the Latest Week,,,NEW YORK -- Yields on most certificates of dep...
1,842613456,False,finalized,3,12/5/15 16:54,,,no,1.0,wsj_399019502,8/21/07,The Morning Brief: White House Seeks to Limit ...,,,The Wall Street Journal Online</br></br>The Mo...
2,842613457,False,finalized,3,12/5/15 1:59,,,no,1.0,wsj_398284048,11/14/91,Banking Bill Negotiators Set Compromise --- Pl...,,,WASHINGTON -- In an effort to achieve banking ...
3,842613458,False,finalized,3,12/5/15 2:19,,0.0,no,0.675,wsj_397959018,6/16/86,Manager's Journal: Sniffing Out Drug Abusers I...,,,The statistics on the enormous costs of employ...
4,842613459,False,finalized,3,12/5/15 17:48,3.0,0.3257,yes,0.64,wsj_398838054,10/4/02,Currency Trading: Dollar Remains in Tight Rang...,,,NEW YORK -- Indecision marked the dollar's ton...


In [4]:
# tengo que agarrar y cambiar los dtype de las variables con encoding
# drop positivity gold and relevance gold
# positivity and ::confidence, have very sparse values
# _unit_state has only the finalized observation and golden the False observation, and trusted judgments only observation '3'

df = df.drop(columns = ['_unit_id', 'positivity_gold', 'relevance_gold', '_unit_state', '_golden', '_trusted_judgments',
                         '_unit_state', '_last_judgment_at', 'positivity', 'positivity:confidence'])

df['articleid'] = df['articleid'].str.slice(0, 3)
# so far, these variables SHOULD be dropped for sure as well as unit id and article id
# should discuss between the positivity and positivity confidence due to amount of null values

In [5]:
df.head()
filtered_df = df[df['relevance'] == 'yes']

filtered_df['articleid'].value_counts()



articleid
wsj    938
wap    482
Name: count, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['relevance'])
y = df['relevance']

# here stratify is used to balance dataset as much as possible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [7]:
print(y_train.value_counts())
print(y_test.value_counts())
# is kind of balanced yes

relevance
no          5257
yes         1136
not sure       7
Name: count, dtype: int64
relevance
no          1314
yes          284
not sure       2
Name: count, dtype: int64


In [8]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Ensure you have downloaded the necessary NLTK data
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

def clean_text(text):
    # Remove punctuations and numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Single character removal
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)
    # Removing multiple spaces
    text = re.sub(r'\s+', ' ', text)
    # Convert to lowercase
    return text.lower()

def preprocess_text(text):
    # Clean the text
    text = clean_text(text)
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    return ' '.join(lemmatized_tokens)

train_tokenized_text = [preprocess_text(doc) for doc in X_train['text']]
train_tokenized_head = [preprocess_text(doc) for doc in X_train['headline']]


X_train['clean_text'] = df['text'].apply(preprocess_text)
X_train['clean_headline'] = df['headline'].apply(preprocess_text)
X_test['clean_text'] = df['text'].apply(preprocess_text)
X_test['clean_headline'] = df['headline'].apply(preprocess_text)

combined_tokenized_text = train_tokenized_text + train_tokenized_head


In [9]:
import gensim

word2vec_model = gensim.models.Word2Vec(combined_tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

In [10]:
import numpy as np
def document_vector(word2vec_model, doc):
    # Remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.index_to_key]
    if len(doc) == 0:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word2vec_model.wv[doc], axis=0)

# Vectorize each column in training and test sets
X_train['vectorized_text'] = [document_vector(word2vec_model, doc) for doc in X_train['clean_text']]
X_train['vectorized_headline'] = [document_vector(word2vec_model, doc) for doc in X_train['clean_headline']]

X_test['vectorized_text'] = [document_vector(word2vec_model, doc) for doc in X_test['clean_text']]
X_test['vectorized_headline'] = [document_vector(word2vec_model, doc) for doc in X_test['clean_headline']]

In [11]:
X_train = X_train.drop(columns=['headline', 'text', 'clean_text', 'clean_headline'])
X_test = X_test.drop(columns=['headline', 'text', 'clean_text', 'clean_headline'])

In [12]:
X_test.head(5)

Unnamed: 0,relevance:confidence,articleid,date,vectorized_text,vectorized_headline
6006,1.0,wap,9/28/86,"[0.087300286, 0.011467085, -0.03953612, 0.0503...","[0.14060357, 0.00290192, -0.037105225, 0.04222..."
2304,1.0,wsj,3/28/96,"[0.09097697, 0.0067355796, -0.032678287, 0.049...","[0.12639074, 0.026700534, -0.04324411, -0.0019..."
7059,1.0,wap,1/9/93,"[0.09460699, 0.003948897, -0.044204634, 0.0554...","[-0.015157646, -0.03469068, -0.018657232, 0.07..."
5880,1.0,wap,2/9/00,"[0.09538122, 0.010254582, -0.030811192, 0.0432...","[0.09177009, 0.045725744, -0.030763423, -0.085..."
6700,1.0,wap,8/23/75,"[0.080340706, 0.00939782, -0.021030651, 0.0446...","[0.10776859, 0.00045859083, -0.026210537, 0.02..."
