In [1]:
#importing packages

import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\linda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
#reading in data
v_college = pd.read_csv("data/vaccine_college_500.csv")
v_college

Unnamed: 0,date,text,retweet_count,favorite_count,reply
0,2021-06-23 23:23:46,b'Pin Code:[411041] \nNavale Medical College\n...,0,0,0
1,2021-06-23 23:16:54,b'@RobSchneider If a college forces the vaccin...,0,0,1
2,2021-06-23 23:14:58,b'@senatenj My Deans List Sophomore unwelcome ...,0,0,0
3,2021-06-23 23:12:56,b'@NYGovCuomo are you paying out if your perso...,0,0,0
4,2021-06-23 23:09:29,b'@tedcruz please help my daughter in #NJ (@Go...,0,0,0
...,...,...,...,...,...
2855,2021-06-15 14:32:05,b'Medgar Evers College\nUniversity at Buffalo ...,0,0,1
2856,2021-06-15 14:30:01,"b'Ellie Ok, a second-year student in the Colle...",0,2,0
2857,2021-06-15 14:23:20,b'@unmtaos has joined the \n@WhiteHouse #COVID...,0,0,0
2858,2021-06-15 14:16:33,b'Medgar Evers College\nUniversity at Buffalo ...,0,0,1


In [3]:
#cleaning
v_college["clean tweets"] = v_college["text"].str[2:] #dropping first two characters apostrophe and b
v_college["clean tweets"] = v_college["clean tweets"].str[:-1] #dropping last character apostrophe
tweets = v_college["clean tweets"]

In [4]:
tweets = [s.replace("\\n", " ") for s in tweets] #dropping \n
tweets = [s.replace(",", "") for s in tweets] #dropping ,
v_college["clean tweets"] = tweets

In [5]:
def cleaning(text): #to remove mentions and links, taken partially from juejue's and Jamie's notebooks
    text = re.sub('\S*@\S*\s?', '',text) #remove emails
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ', text) #remove links
    text = re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", '', text) #also removing links + punctuation
    return text

v_college["clean tweets"] = v_college["clean tweets"].apply(cleaning)

In [6]:
#stemming words
stemmer = SnowballStemmer("english")
text = v_college["clean tweets"].to_list()
process_tweets = []
for sentence in text:
    #iterate through each word in a tweet/sentence, if not part of stopwords list then keep in tweet
    process_tweets.append(" ".join([stemmer.stem(i) for i in sentence.split() if i not in gensim.parsing.preprocessing.STOPWORDS]))


#get our tokens/words  
tokens = [[word for word in process_tweets.split()] for process_tweets in process_tweets]
v_college["clean tweets"] = tokens

In [12]:
#quick look at final dataframe
v_college

Unnamed: 0,date,text,retweet_count,favorite_count,reply,clean tweets
0,2021-06-23 23:23:46,b'Pin Code:[411041] \nNavale Medical College\n...,0,0,0,"[pin, code411041, naval, medic, colleg, vaccin..."
1,2021-06-23 23:16:54,b'@RobSchneider If a college forces the vaccin...,0,0,1,"[if, colleg, forc, vaccin, su, exist, wreck, f..."
2,2021-06-23 23:14:58,b'@senatenj My Deans List Sophomore unwelcome ...,0,0,0,"[my, dean, list, sophomor, unwelcom, colleg, w..."
3,2021-06-23 23:12:56,b'@NYGovCuomo are you paying out if your perso...,0,0,0,"[pay, person, pocket, colleg, scholarship, gav..."
4,2021-06-23 23:09:29,b'@tedcruz please help my daughter in #NJ (@Go...,0,0,0,"[help, daughter, nj, return, colleg, shes, dec..."
...,...,...,...,...,...,...
2855,2021-06-15 14:32:05,b'Medgar Evers College\nUniversity at Buffalo ...,0,0,1,"[medgar, ever, colleg, univers, buffalo, south..."
2856,2021-06-15 14:30:01,"b'Ellie Ok, a second-year student in the Colle...",0,2,0,"[elli, ok, secondyear, student, colleg, osteop..."
2857,2021-06-15 14:23:20,b'@unmtaos has joined the \n@WhiteHouse #COVID...,0,0,0,"[join, covidcollegechalleng, a, vaccin, champi..."
2858,2021-06-15 14:16:33,b'Medgar Evers College\nUniversity at Buffalo ...,0,0,1,"[medgar, ever, colleg, univers, buffalo, south..."


In [8]:
#gensim dictionary and corpus

#creating dictionary
tweet_dict = gensim.corpora.Dictionary(tokens)

#bag of words corpus
bow_corpus = [tweet_dict.doc2bow(text) for text in tokens]

In [15]:
#implementing LDA model
# number of topics
topic_number = 5

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=bow_corpus,
                                       id2word=tweet_dict,
                                       num_topics=topic_number)
lda_model.print_topics(num_words=20) #giving the combination of words for each topic

[(0,
  '0.055*"vaccin" + 0.035*"colleg" + 0.014*"covid19" + 0.011*"covid" + 0.008*"student" + 0.007*"capac" + 0.006*"requir" + 0.006*"school" + 0.005*"the" + 0.005*"dose" + 0.005*"age" + 0.005*"univers" + 0.005*"dont" + 0.005*"like" + 0.004*"i" + 0.004*"year" + 0.004*"receiv" + 0.004*"medic" + 0.004*"no" + 0.004*"risk"'),
 (1,
  '0.062*"vaccin" + 0.051*"colleg" + 0.019*"i" + 0.016*"student" + 0.011*"the" + 0.008*"covid19" + 0.007*"covid" + 0.006*"suni" + 0.005*"requir" + 0.005*"school" + 0.005*"peopl" + 0.005*"campus" + 0.005*"dose" + 0.004*"got" + 0.004*"mandat" + 0.004*"it" + 0.004*"center" + 0.004*"amp" + 0.004*"univers" + 0.004*"fall"'),
 (2,
  '0.036*"colleg" + 0.032*"vaccin" + 0.023*"suni" + 0.015*"center" + 0.012*"arena" + 0.011*"dome" + 0.011*"rochest" + 0.010*"westburi" + 0.010*"medgar" + 0.010*"the" + 0.009*"univers" + 0.009*"nys" + 0.009*"old" + 0.009*"fall" + 0.009*"buffalo" + 0.009*"niagara" + 0.009*"ever" + 0.008*"campus" + 0.008*"pune" + 0.007*"south"'),
 (3,
  '0.045*"v

In [None]:
`
import numpy as np
from scipy.sparse import coo_matrix
from coclust.coclustering import CoclustSpecMod
from coclust.visualization import plot_cluster_sizes
n_clust = 4
file_name = "data/dtm_college.csv"


a = np.loadtxt(file_name, delimiter = ',')#, skiprows = 1)
X = (coo_matrix((a[:, 2], (a[:, 0].astype(int), a[:, 1].astype(int)))))
X = X.tocsr()

model = CoclustSpecMod(n_clusters = n_clust, random_state = 0)
model.fit(X)
plot_cluster_sizes(model)


In [22]:
import numpy as np
from scipy.sparse import coo_matrix
from coclust.coclustering import CoclustSpecMod
from coclust.visualization import plot_cluster_sizes
dtm = pd.read_csv("data/dtm_college.csv")
#coo_matrix((a[:, 2], (a[:, 0].astype(int), a[:, 1].astype(int)))) copied from the paper

In [38]:
import scipy
dtm = pd.read_csv("data/dtm_college.csv")
csrtest = scipy.sparse.csr_matrix(dtm.values)
model = CoclustSpecMod(n_clusters = 5, random_state = 0)
model.fit(csrtest)

TypeError: check_array() got an unexpected keyword argument 'warn_on_dtype'