In [26]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
from sklearn.metrics import classification_report
import numpy as np

In [27]:
# Read the training split from the working directory and preview the first
# rows to check the column layout (Class Index, Title, Description).
df = pd.read_csv(filepath_or_buffer="train.csv")
df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [28]:
# Merge headline and body into one text field so a single feature extractor
# sees both.
df['News'] = df['Title'] + " " + df['Description']

# Keep only the label and the merged text. The explicit .copy() makes the
# two-column frame independent of the frame it was sliced from, so the
# column assignments performed in later cells do not raise
# SettingWithCopyWarning.
df = df[['Class Index', 'News']].copy()
df.head()

Unnamed: 0,Class Index,News
0,3,Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new..."


In [29]:
# Fold everything to lower case so that differently-cased spellings of the
# same word end up as one token.
news_lowercased = df['News'].str.lower()
df['News'] = news_lowercased
df['News']

0         wall st. bears claw back into the black (reute...
1         carlyle looks toward commercial aerospace (reu...
2         oil and economy cloud stocks' outlook (reuters...
3         iraq halts oil exports from main southern pipe...
4         oil prices soar to all-time record, posing new...
                                ...                        
119995    pakistan's musharraf says won't quit as army c...
119996    renteria signing a top-shelf deal red sox gene...
119997    saban not going to dolphins yet the miami dolp...
119998    today's nfl games pittsburgh at ny giants time...
119999    nets get carter from raptors indianapolis -- a...
Name: News, Length: 120000, dtype: object

In [30]:
# Split every article into a list of tokens with NLTK's word tokenizer;
# each cell of the column goes from a string to a list of strings.
news_tokenized = df['News'].apply(word_tokenize)
df['News'] = news_tokenized
df['News']

0         [wall, st., bears, claw, back, into, the, blac...
1         [carlyle, looks, toward, commercial, aerospace...
2         [oil, and, economy, cloud, stocks, ', outlook,...
3         [iraq, halts, oil, exports, from, main, southe...
4         [oil, prices, soar, to, all-time, record, ,, p...
                                ...                        
119995    [pakistan, 's, musharraf, says, wo, n't, quit,...
119996    [renteria, signing, a, top-shelf, deal, red, s...
119997    [saban, not, going, to, dolphins, yet, the, mi...
119998    [today, 's, nfl, games, pittsburgh, at, ny, gi...
119999    [nets, get, carter, from, raptors, indianapoli...
Name: News, Length: 120000, dtype: object

In [31]:
# English stop-word list shipped with NLTK, stored as a set so the
# membership checks in the filtering step are fast.
english_stopword_list = stopwords.words("english")
stop_words = set(english_stopword_list)
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [32]:
def _keep_content_words(tokens):
    """Return the tokens that are purely alphabetic and not stop words.

    `str.isalpha()` rejects punctuation, numbers and any token containing a
    non-letter character, so hyphenated words ("all-time"), abbreviations
    with a period ("st.") and contraction pieces ("n't", "'s") are dropped.
    The leading half of a split contraction (e.g. "wo" from "won't") is
    alphabetic and is therefore kept.
    """
    return [tok for tok in tokens if tok.isalpha() and tok not in stop_words]


# Applied element-wise through Series.apply instead of a
# `for i in range(len(df))` loop with df.loc[i] / df.at[i]: that loop treats
# positions as index labels, so it is only correct when the index is exactly
# 0..n-1, and per-row scalar access is slow on a frame of this size.
df['News'] = df['News'].apply(_keep_content_words)

df['News']

0         [wall, bears, claw, back, black, reuters, reut...
1         [carlyle, looks, toward, commercial, aerospace...
2         [oil, economy, cloud, stocks, outlook, reuters...
3         [iraq, halts, oil, exports, main, southern, pi...
4         [oil, prices, soar, record, posing, new, menac...
                                ...                        
119995    [pakistan, musharraf, says, wo, quit, army, ch...
119996    [renteria, signing, deal, red, sox, general, m...
119997    [saban, going, dolphins, yet, miami, dolphins,...
119998    [today, nfl, games, pittsburgh, ny, giants, ti...
119999    [nets, get, carter, raptors, indianapolis, vin...
Name: News, Length: 120000, dtype: object

In [33]:
lem = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the tokens reduced to their WordNet lemma, treated as verbs.

    The part-of-speech argument "v" is passed for every token, so each word
    is lemmatized as if it were a verb regardless of its actual role.
    """
    return [lem.lemmatize(tok, "v") for tok in tokens]


# Applied element-wise through Series.apply instead of an integer-label loop
# with df.loc[i] / df.at[i], so the step does not depend on the index being
# exactly 0..n-1 and avoids slow per-row scalar access.
df['News'] = df['News'].apply(_lemmatize_tokens)

df.head()


Unnamed: 0,Class Index,News
0,3,"[wall, bear, claw, back, black, reuters, reute..."
1,3,"[carlyle, look, toward, commercial, aerospace,..."
2,3,"[oil, economy, cloud, stock, outlook, reuters,..."
3,3,"[iraq, halt, oil, export, main, southern, pipe..."
4,3,"[oil, price, soar, record, pose, new, menace, ..."


In [None]:
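# Disabled experiment (kept for reference): train Word2Vec embeddings on the
# tokenised articles. The notebook uses the TF-IDF features built further
# down instead.
# sentences = df['News'].tolist()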

# word2vec = Word2Vec(
#     sentences=sentences,
#     vector_size=100,
#     window=5,
#     min_count=2,
#     sg=0,
#     epochs=155,
#     alpha=0.025
# )

In [None]:
# # print(word2vec.wv['oil'])
# print(word2vec.wv.most_similar('oil', topn=5))

[('crude', 0.6871545910835266), ('barrel', 0.582028329372406), ('energy', 0.558076024055481), ('highs', 0.5504552125930786), ('treasuries', 0.5439140200614929)]


In [None]:
# X = []

# for tokens in df['News']:
#     valid_words = [w for w in tokens if w in word2vec.wv]

#     if len(valid_words) == 0:
#         vec = np.zeros(word2vec.vector_size)

#     else:
#         vec = np.mean(word2vec.wv[valid_words], axis=0)

#     X.append(vec)

In [None]:
# X = np.array(X)
# y = df['Class Index']

In [34]:
# TfidfVectorizer expects raw strings, so glue each token list back into a
# single space-separated document.
texts = list(map(" ".join, df['News']))

In [36]:
# TF-IDF features with the vocabulary capped at 5000 terms to bound the
# width of the feature matrix.
vectorizer = TfidfVectorizer(
    max_features=5000,
)

In [37]:
# Target: the class label of each article.
y = df['Class Index']

# Feature matrix: TF-IDF weights for every document.
# NOTE(review): the vectorizer is fitted on the full corpus here, before the
# train/test split that follows, so the vocabulary and IDF weights are
# computed with the held-out documents included. Consider splitting `texts`
# first and fitting on the training portion only.
X = vectorizer.fit_transform(texts)

In [38]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [39]:
# Logistic-regression classifier created with the library's default
# hyperparameters (no arguments are overridden here).
model = LogisticRegression()

In [40]:
# Train on the training portion only; as the last expression of the cell,
# the fitted estimator is also displayed.
model.fit(X_train, y_train)

In [41]:
# Predicted class labels for the held-out rows, compared against y_test in
# the evaluation cell.
y_pred = model.predict(X_test)

In [42]:
# Per-class precision / recall / F1 on the held-out rows.
report_text = classification_report(y_test, y_pred)
print(f"Classification Report: \n{report_text}")

Classification Report: 
              precision    recall  f1-score   support

           1       0.92      0.90      0.91      5956
           2       0.95      0.97      0.96      6058
           3       0.87      0.88      0.87      5911
           4       0.89      0.88      0.88      6075

    accuracy                           0.91     24000
   macro avg       0.91      0.91      0.91     24000
weighted avg       0.91      0.91      0.91     24000

