### Problem 2  


#### Import libraries  

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.parsing.preprocessing import preprocess_string
from gensim.models.phrases import Phrases, Phraser

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

#### Parameters  

In [2]:
csv_in = 'dl-end1-2.csv'
min_count = 10
min_words = 50

embed_size = 100

#### Read CSV file  

In [3]:
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
print(df.info())
display(df.head())

(1786, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1786 entries, 0 to 1785
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   content       1786 non-null   object
 1   target        1786 non-null   int64 
 2   target_names  1786 non-null   object
dtypes: int64(1), object(2)
memory usage: 42.0+ KB
None


Unnamed: 0,content,target,target_names
0,"From article <C5owCB.n3p@world.std.com>, by t...",1,sci.space
1,In article <1993Apr23.184732.1105@aio.jsc.nas...,1,sci.space
2,What is the EXACT entry (parameter and synta...,2,comp.windows.x
3,In article <1993Apr20.151818.4319@samba.oit.u...,0,rec.sport.hockey
4,In article <120666@netnews.upenn.edu> kkeller...,0,rec.sport.hockey


In [4]:
def func(s):
    return len(s.split())

#### (A) Delete too short docs  

In [5]:
df = df[ df['content'].map(lambda x: len(x.split())) >= min_words ]  
# df = df[ df['content'].map(func) >= min_words ]  
df = df.reset_index(drop=True)
print(df.shape)

(1595, 3)


#### Remove stop_words, punctuations, etc.   

In [6]:
df['content'] = df['content'].map(preprocess_string)
display(df.head())

Unnamed: 0,content,target,target_names
0,"[articl, tombak, world, std, com, tom, baker, ...",1,sci.space
1,"[articl, kjenk, gothamc, jsc, nasa, gov, write...",1,sci.space
2,"[exact, entri, paramet, syntax, termin, config...",2,comp.windows.x
3,"[articl, scott, mark, launchpad, unc, edu, sco...",0,rec.sport.hockey
4,"[articl, kkeller, mail, sa, upenn, edu, keith,...",0,rec.sport.hockey


#### Detect Bi-gram  

In [7]:
words = []
for i in range(len(df)):
    #print(i, df.at[i, 'content'])
    words.append(df.at[i, 'content'])
    #print(len(words))  # debug
#print(words[:5])  # debug

In [8]:
%%time

phrases_bi = Phrases(words, min_count=30, threshold=10.0)
bigram = Phraser(phrases_bi)
df['content'] = df['content'].map(lambda x: bigram[x])
display(df.head())

Unnamed: 0,content,target,target_names
0,"[articl, tombak, world, std, com, tom, baker, ...",1,sci.space
1,"[articl, kjenk, gothamc, jsc, nasa_gov, write,...",1,sci.space
2,"[exact, entri, paramet, syntax, termin, config...",2,comp.windows.x
3,"[articl, scott, mark, launchpad, unc, edu, sco...",0,rec.sport.hockey
4,"[articl, kkeller, mail, sa, upenn, edu, keith,...",0,rec.sport.hockey


CPU times: user 2.4 s, sys: 20.2 ms, total: 2.42 s
Wall time: 2.42 s


#### Assign docID according to its category  


In [9]:
docID = []
j = np.zeros(len(df['target'].value_counts()))
for i in range(len(df)):
    tgt = df.at[i, 'target']

    docID.append('d'+str(int(tgt*1000+j[tgt])))
    
    j[tgt] += 1

df['docID'] = docID
display(df.head())

Unnamed: 0,content,target,target_names,docID
0,"[articl, tombak, world, std, com, tom, baker, ...",1,sci.space,d1000
1,"[articl, kjenk, gothamc, jsc, nasa_gov, write,...",1,sci.space,d1001
2,"[exact, entri, paramet, syntax, termin, config...",2,comp.windows.x,d2000
3,"[articl, scott, mark, launchpad, unc, edu, sco...",0,rec.sport.hockey,d0
4,"[articl, kkeller, mail, sa, upenn, edu, keith,...",0,rec.sport.hockey,d1


#### Calculation of Doc2Vec  

In [10]:
docs = []
for i in range(len(df)):
    c = df.at[i, 'content']
    doc_id = df.at[i, 'docID']
    # make TaggedDocument
    td = TaggedDocument(words=c, tags=[doc_id])
    docs.append(td)

#### Calculation of doc vectors  

In [11]:
%%time

model = Doc2Vec(documents=docs, vector_size=embed_size,
                min_count=min_count, dm=0, epochs=20)  

print(model)

model.init_sims(replace=True)



Doc2Vec(dbow,d100,n5,mc10,s0.001,t3)
CPU times: user 7.01 s, sys: 156 ms, total: 7.17 s
Wall time: 3.43 s


##### Check word set  

In [12]:
print(len(model.wv.vocab.keys()))  # number of words
print(list(model.wv.vocab.keys())[:10])  # show first 10 words

3541
['articl', 'world', 'com', 'tom', 'baker', 'edu', 'pack', 'write', 'clear', 'warn']


In [13]:
docvecs = model.docvecs.vectors_docs
print(docvecs.shape)

(1595, 100)


#### (B) K-Means clustering  

In [14]:
n_cls = 5
km =  __(B1)__
cls = km.fit_predict(docvecs)

SyntaxError: invalid syntax (2543988675.py, line 2)

#### Check correspondence of target and clusters  

In [None]:
display(pd.crosstab(df['target'], cls))