# Home task: Topic modeling

In [70]:
from itertools import islice

import gensim
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

Loading dataset

In [71]:
# Dataset location, given relative to the notebook's working directory.
fn = './data/voted-kaggle-dataset.csv'
df = pd.read_csv(filepath_or_buffer=fn)

Drop rows with a missing (NaN) description

In [72]:
# Keep only the rows whose 'Description' is present; the topic model is
# built from that column alone.
df = df.dropna(subset=['Description'])
df.shape[0]

2145

In [73]:
# Plain Python list of the descriptions, in DataFrame row order.
data = df['Description'].tolist()
print('len of texts = {:,}'.format(len(df)))

# Show one arbitrary description as a sanity check of the loaded text.
index = 10
data[index]

len of texts = 2,145


'These files contain complete loan data for all loans issued through the 2007-2015, including the current loan status (Current, Late, Fully Paid, etc.) and latest payment information. The file containing loan data through the "present" contains complete loan data for all loans issued through the previous completed calendar quarter. Additional features include credit scores, number of finance inquiries, address including zip codes, and state, and collections among others. The file is a matrix of about 890 thousand observations and 75 variables. A data dictionary is provided in a separate file. k'

Define and fit the vectorizer

In [74]:
# Despite its name, this pattern matches single tokens made of three or more
# word characters, so one- and two-character tokens are dropped.
three_words_pattern = r"\b\w{3,}\b"

vectorizer = CountVectorizer(
    token_pattern=three_words_pattern,
    stop_words='english',
    min_df=20,  # ignore terms that appear in fewer than 20 documents
)
# Learn the vocabulary from the full set of descriptions.
vectorizer.fit(data)

Vectorize dataset

In [75]:
# Sparse term-count matrix built with the vocabulary learned by the fit above.
data_vectorized = vectorizer.transform(raw_documents=data)

Create gensim corpus

In [76]:
# The vectorizer output has one row per document, so tell gensim that
# documents are NOT stored as columns.
corpus = gensim.matutils.Sparse2Corpus(data_vectorized, documents_columns=False)

# Preview the first five documents as lists of (term_id, count) pairs.
# islice stops after five items instead of materializing the whole corpus
# just to throw most of it away.
list(islice(corpus, 5))

[[(18, 1),
  (20, 1),
  (39, 1),
  (41, 2),
  (120, 1),
  (150, 1),
  (154, 1),
  (174, 1),
  (218, 1),
  (221, 1),
  (257, 1),
  (264, 3),
  (266, 2),
  (283, 1),
  (324, 1),
  (328, 1),
  (353, 3),
  (380, 1),
  (402, 1),
  (411, 1),
  (417, 3),
  (420, 4),
  (421, 1),
  (425, 1),
  (456, 1),
  (458, 1),
  (567, 1),
  (575, 1),
  (613, 4),
  (614, 3),
  (695, 1),
  (715, 1),
  (736, 1),
  (753, 3),
  (767, 1),
  (804, 1),
  (808, 1),
  (822, 1),
  (841, 1),
  (893, 2),
  (948, 1),
  (975, 1),
  (1014, 2),
  (1087, 1),
  (1089, 1),
  (1111, 1),
  (1139, 1),
  (1192, 1),
  (1224, 1),
  (1246, 1),
  (1252, 1),
  (1289, 1),
  (1325, 1),
  (1357, 1),
  (1366, 1),
  (1370, 1),
  (1425, 1),
  (1445, 1),
  (1574, 1),
  (1605, 2),
  (1615, 1),
  (1625, 3),
  (1626, 4),
  (1654, 1),
  (1675, 1),
  (1680, 1),
  (1686, 1),
  (1688, 1),
  (1689, 1)],
 [(0, 3),
  (13, 1),
  (21, 2),
  (30, 2),
  (34, 2),
  (45, 1),
  (60, 1),
  (83, 1),
  (91, 1),
  (102, 1),
  (105, 1),
  (128, 1),
  (140, 5),
  

Create id2word dictionary

In [80]:
# vectorizer.vocabulary_ maps term -> column index; gensim's id2word needs
# the inverse mapping (column index -> term).
id_map = {term_id: term for term, term_id in vectorizer.vocabulary_.items()}
id_map

{421: 'datasets',
 353: 'contains',
 1626: 'transactions',
 402: 'credit',
 218: 'cards',
 1445: 'september',
 18: '2013',
 567: 'european',
 420: 'dataset',
 1089: 'occurred',
 425: 'days',
 736: 'highly',
 1192: 'positive',
 264: 'class',
 39: 'account',
 808: 'input',
 1689: 'variables',
 1370: 'result',
 1654: 'unfortunately',
 841: 'issues',
 1252: 'provide',
 1111: 'original',
 614: 'features',
 154: 'background',
 804: 'information',
 417: 'data',
 324: 'components',
 1087: 'obtained',
 1605: 'time',
 613: 'feature',
 1425: 'seconds',
 1625: 'transaction',
 1675: 'used',
 575: 'example',
 380: 'cost',
 893: 'learning',
 1366: 'response',
 1688: 'variable',
 1574: 'takes',
 1686: 'value',
 221: 'case',
 695: 'given',
 1289: 'ratio',
 41: 'accuracy',
 1680: 'using',
 120: 'area',
 975: 'matrix',
 266: 'classification',
 283: 'collected',
 1357: 'research',
 948: 'machine',
 715: 'group',
 753: 'http',
 174: 'big',
 1014: 'mining',
 458: 'detection',
 456: 'details',
 411: 'current

Create LDA model

In [81]:
# Train a 6-topic LDA model; random_state is fixed so that re-running the
# notebook reproduces the same topics.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id_map,
    num_topics=6,
    passes=25,
    random_state=34,
)

In [84]:
# Show the six highest-weighted words for each of the six topics.
ldamodel.print_topics(num_words=6, num_topics=6)

[(0,
  '0.025*"data" + 0.016*"dataset" + 0.008*"set" + 0.008*"class" + 0.008*"information" + 0.008*"instances"'),
 (1,
  '0.471*"university" + 0.090*"state" + 0.060*"college" + 0.026*"california" + 0.019*"institute" + 0.015*"north"'),
 (2,
  '0.028*"dataset" + 0.025*"data" + 0.012*"time" + 0.011*"number" + 0.009*"csv" + 0.009*"player"'),
 (3,
  '0.055*"dataset" + 0.024*"does" + 0.023*"description" + 0.023*"data" + 0.017*"model" + 0.017*"trained"'),
 (4,
  '0.023*"csv" + 0.022*"dataset" + 0.011*"text" + 0.011*"contains" + 0.010*"file" + 0.009*"content"'),
 (5,
  '0.063*"data" + 0.014*"dataset" + 0.014*"content" + 0.013*"context" + 0.011*"inspiration" + 0.011*"acknowledgements"')]

Classify new text

In [85]:
# The vectorizer takes an iterable of documents, so the single sentence is
# wrapped in a one-element list.
new_doc = ["I study at the University of North Carolina at Chapel Hill"]
print(new_doc[0])

I study at the University of North Carolina at Chapel Hill


In [86]:
# Encode the new text with the vocabulary learned from the training data.
doc_vectorized = vectorizer.transform(new_doc)

# Wrap it as a gensim corpus (documents are rows, as for the training corpus).
new_doc_corpus = gensim.matutils.Sparse2Corpus(
    sparse=doc_vectorized,
    documents_columns=False,
)

# Infer the topic distribution for each document in the new corpus;
# list() forces evaluation so the result is displayed.
doc_topics = ldamodel.get_document_topics(bow=new_doc_corpus)
list(doc_topics)

[[(0, 0.29150218),
  (1, 0.5417198),
  (2, 0.041713953),
  (3, 0.0416779),
  (4, 0.041668802),
  (5, 0.04171738)]]