In [91]:
import pandas as pd 
import numpy as np
import matplotlib as plt
import seaborn as sns 

import nltk
import re
from nltk.corpus import stopwords
from nltk import word_tokenize, WordNetLemmatizer
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [92]:
# Read the article dataset from the CSV file and preview its first five rows
df = pd.read_csv(filepath_or_buffer="df_small.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,article,publication
0,0,"This post is part of Polyarchy, an independent...",Vox
1,1,The Indianapolis Colts made Andrew Luck the h...,Business Insider
2,2,"DAVOS, Switzerland (Reuters) - U.S. President ...",Reuters
3,3,PARIS (Reuters) - Former French president Nico...,Reuters
4,4,Paris Hilton arrived at LAX Wednesday dressed ...,TMZ


In [93]:
# Removing columns
# Drop the leftover "Unnamed: ..." column(s) (the label pandas gives a CSV column
# with no header, typically a saved index). Selecting by name and rebinding `df`
# instead of dropping position 0 in place keeps this cell idempotent: running it
# again can no longer remove `article` or `publication`.
unnamed_columns = [col for col in df.columns if str(col).startswith("Unnamed")]
df = df.drop(columns=unnamed_columns)
df.head()

Unnamed: 0,article,publication
0,"This post is part of Polyarchy, an independent...",Vox
1,The Indianapolis Colts made Andrew Luck the h...,Business Insider
2,"DAVOS, Switzerland (Reuters) - U.S. President ...",Reuters
3,PARIS (Reuters) - Former French president Nico...,Reuters
4,Paris Hilton arrived at LAX Wednesday dressed ...,TMZ


In [94]:
# Text cleaning: shared resources are built once and reused by every clean() call
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean(text):
    """Normalise one article into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase the text; delete every character that is not a
    letter, digit or whitespace; collapse whitespace runs into a single space;
    tokenize with ``word_tokenize``; discard tokens present in ``stop_words``;
    lemmatize the remaining tokens with ``lemmatizer``.

    Punctuation is deleted rather than replaced by a space, so a hyphenated
    word such as "highest-paid" comes out as "highestpaid".

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The cleaned article, with tokens joined by single spaces.
    """
    lowered = text.lower()
    letters_digits_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', letters_digits_only)

    # Filter stopwords and lemmatize in one pass over the tokens
    kept_lemmas = (
        lemmatizer.lemmatize(token)
        for token in word_tokenize(single_spaced)
        if token not in stop_words
    )
    return ' '.join(kept_lemmas)

In [95]:
# Run clean() on every article; the `article` column is overwritten with the result
df['article'] = df['article'].map(clean)

In [96]:
# Preview the first rows of df
df.head()

Unnamed: 0,article,publication
0,post part polyarchy independent blog produced ...,Vox
1,indianapolis colt made andrew luck highestpaid...,Business Insider
2,davos switzerland reuters u president donald t...,Reuters
3,paris reuters former french president nicolas ...,Reuters
4,paris hilton arrived lax wednesday dressed pay...,TMZ


In [97]:
# (number of rows, number of columns) of df
df.shape

(2995, 2)

Embedding

Bag of Words (bow)

In [98]:
def bow(df, column, vectorizer=None):
    """Build a bag-of-words (term count) matrix from a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents.
    column : str
        Name of the column containing one document string per row.
    vectorizer : CountVectorizer, optional
        Vectorizer to fit. Pass your own instance when the fitted vocabulary is
        needed afterwards (e.g. ``get_feature_names_out()``); it is fitted in
        place. A fresh default ``CountVectorizer()`` is used when omitted.

    Returns
    -------
    tuple
        ``(bow_matrix, bow_matrix.shape)`` where ``bow_matrix`` is the sparse
        document-term count matrix.
    """
    if vectorizer is None:
        vectorizer = CountVectorizer()
    # fit_transform learns the vocabulary and produces the counts in one call
    bow_matrix = vectorizer.fit_transform(df[column])
    return bow_matrix, bow_matrix.shape

# Keep the vectorizer and the matrix at notebook level: later cells read
# `bow_matrix` and `vect_bow`, which would otherwise exist only inside bow().
vect_bow = CountVectorizer()
bow_matrix, bow_shape = bow(df, "article", vectorizer=vect_bow)
bow_matrix, bow_shape

(<2995x61145 sparse matrix of type '<class 'numpy.int64'>'
 	with 688141 stored elements in Compressed Sparse Row format>,
 (2995, 61145))

In [99]:
# Distinct count values among the stored entries of the sparse BoW matrix
# (reads the notebook-level `bow_matrix`)
unique_values = np.unique(ar=bow_matrix.data)

# Show them
print(unique_values)

[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  50  51  52  53  54  55
  56  57  58  59  60  61  62  63  64  66  67  68  69  70  71  72  76  77
  78  79  81  82  83  84  85  86  87  88  90  95 100 101 106 130 132 149
 164 165 189 277 294]


In [100]:
# Vocabulary terms of the fitted vectorizer
# (expects `vect_bow` and `bow_matrix` to exist at notebook level)
vocabulary = vect_bow.get_feature_names_out()

# Keep the terms whose total count, summed over all documents, equals the target.
# NOTE(review): this compares corpus-wide column totals, whereas the values in
# `bow_matrix.data` are per-document counts; confirm which one was intended.
target_count = 189
target_words = [
    term
    for term, total in zip(vocabulary, np.asarray(bow_matrix.sum(axis=0)).ravel())
    if total == target_count
]

# Show the matching terms
print(target_words)

['40', 'clearly', 'contract', 'direct', 'paris']


Word2Vec trained on df

In [111]:
from gensim.models import Word2Vec

def selfword2vec(df, column, word, vector_size=100, window=5, min_count=1,
                 workers=4, topn=5):
    """Train a Word2Vec model on a text column and print a word's nearest neighbours.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the documents.
    column : str
        Name of the column with one whitespace-tokenizable string per row.
    word : str
        Query word whose most similar words are printed.
    vector_size, window, min_count, workers : int, optional
        Passed straight to ``gensim.models.Word2Vec``; the defaults are the
        values this notebook previously hard-coded.
    topn : int, optional
        Number of similar words to print (default 5).

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.

    Raises
    ------
    KeyError
        If ``word`` is not in the trained model's vocabulary.
    """
    # Each article becomes one "sentence": a list of whitespace-separated tokens
    sentences = [article.split() for article in df[column]]

    model = Word2Vec(
        sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )

    # Fail with an explicit message instead of an opaque lookup error
    if word not in model.wv.key_to_index:
        raise KeyError(f"'{word}' is not in the Word2Vec vocabulary")

    similar_words = model.wv.most_similar(word, topn=topn)
    print(similar_words)
    return model

# word2vec_df is the word2vec model that can be used for further steps
word2vec_df = selfword2vec(df, "article", "contract")
word2vec_df

[('transaction', 0.9814221858978271), ('outlook', 0.9755303263664246), ('currency', 0.9706804156303406), ('restructuring', 0.96925950050354), ('phased', 0.9688061475753784)]


<gensim.models.word2vec.Word2Vec at 0x2497a75c550>

Word2Vec Google News 300

In [113]:
#import gensim.downloader as api

#wv = api.load('word2vec-google-news-300')

# Word2Vec model pre-trained on the Google News dataset


Topic Modeling - LDA