In [1]:
import pandas as pd
import numpy as np

In [2]:
twit = pd.read_csv('data/text-preprocessing.csv', usecols=["tweet_id", "tweet_tokens_stemmed"])
twit.columns = ["label", "tweet"]
twit

Unnamed: 0,label,tweet
0,153619597713608704,"['lomba', 'poster', 'ilmiah', 'energi', 'baru'..."
1,153857491925610496,"['elaahhh', 'ngomong', 'energi', 'baru', 'tv',..."
2,154361424154603520,"['daerah', 'pencil', 'membutuhkam', 'listrik',..."
3,156397219346518017,"['tarik', 'materi', 'energi', 'baru', 'baek', ..."
4,156936339718279168,"['aneh', 'inget', 'jatropa', 'alias', 'minyak'..."
...,...,...
109700,1542534421219360772,"['bkpm', 'kembang', 'investasi', 'ebt', 'indon..."
109701,1542540419677859842,"['chemation', 'heat', 'exchanger', 'energi', '..."
109702,1542557674704740353,"['ebt', 'dadi', 'sumber', 'energi', 'listrik',..."
109703,1542592685978320896,"['seminar', 'nasional', 'himatikro', 'aktualis..."


In [3]:
# convert list formated string to list
import ast

def convert_text_list(texts):
    texts = ast.literal_eval(texts)
    return [text for text in texts]

twit["tweet_list"] = twit["tweet"].apply(convert_text_list)


print(twit["tweet_list"])

print("\ntype : ", type(twit["tweet_list"]))

0         [lomba, poster, ilmiah, energi, baru, deadline...
1               [elaahhh, ngomong, energi, baru, tv, biruu]
2         [daerah, pencil, membutuhkam, listrik, manfaat...
3         [tarik, materi, energi, baru, baek, ganti, nam...
4         [aneh, inget, jatropa, alias, minyak, jarak, b...
                                ...                        
109700    [bkpm, kembang, investasi, ebt, indonesia, ham...
109701    [chemation, heat, exchanger, energi, manfaat, ...
109702          [ebt, dadi, sumber, energi, listrik, utama]
109703    [seminar, nasional, himatikro, aktualisasi, pe...
109704    [pres, tarif, listrik, ebt, nanti, menteri, er...
Name: tweet_list, Length: 109705, dtype: object

type :  <class 'pandas.core.series.Series'>


In [4]:
def calc_TF(document):
    # Counts the number of times the word appears in review
    TF_dict = {}
    for term in document:
        if term in TF_dict:
            TF_dict[term] += 1
        else:
            TF_dict[term] = 1
    # Computes tf for each word
    for term in TF_dict:
        TF_dict[term] = TF_dict[term] / len(document)
    return TF_dict

twit["TF_dict"] = twit['tweet_list'].apply(calc_TF)

twit["TF_dict"].head(10)

0    {'lomba': 0.08333333333333333, 'poster': 0.083...
1    {'elaahhh': 0.16666666666666666, 'ngomong': 0....
2    {'daerah': 0.14285714285714285, 'pencil': 0.14...
3    {'tarik': 0.1111111111111111, 'materi': 0.1111...
4    {'aneh': 0.07692307692307693, 'inget': 0.07692...
5    {'eco': 0.1, 'power': 0.1, 'booster': 0.1, 'ka...
6    {'mahasiswa': 0.18181818181818182, 'tawur': 0....
7    {'misi': 0.07692307692307693, 'jalan': 0.07692...
8    {'seru': 0.09090909090909091, 'adik': 0.090909...
9    {'usaha': 0.125, 'nuklir': 0.125, 'salah': 0.1...
Name: TF_dict, dtype: object

In [5]:
# Check TF result
index = len(twit)-1

print('%20s' % "term", "\t", "TF\n")
for key in twit["TF_dict"][index]:
    print('%20s' % key, "\t", twit["TF_dict"][index][key])

                term 	 TF

                pres 	 0.125
               tarif 	 0.125
             listrik 	 0.125
                 ebt 	 0.125
               nanti 	 0.125
             menteri 	 0.125
               erick 	 0.125
               teken 	 0.125


In [6]:
def calc_IDF(tfDict):
    count_DF = {}
    # Run through each document's tf dictionary and increment countDict's (term, doc) pair
    for document in tfDict:
        for term in document:
            if term in count_DF:
                count_DF[term] += 1
            else:
                count_DF[term] = 1
    return count_DF

DF = calc_IDF(twit["TF_dict"])

In [7]:
DF

{'lomba': 229,
 'poster': 41,
 'ilmiah': 53,
 'energi': 48821,
 'baru': 41118,
 'deadline': 29,
 'januari': 104,
 'untuk': 19,
 'siswa': 24,
 'sma': 101,
 'smk': 28,
 'derajat': 30,
 'elaahhh': 1,
 'ngomong': 76,
 'tv': 207,
 'biruu': 2,
 'daerah': 1106,
 'pencil': 241,
 'membutuhkam': 1,
 'listrik': 12360,
 'manfaat': 4794,
 'tarik': 883,
 'materi': 118,
 'baek': 6,
 'ganti': 922,
 'nama': 260,
 'propasif': 1,
 'ajah': 16,
 'aneh': 55,
 'inget': 56,
 'jatropa': 1,
 'alias': 76,
 'minyak': 1544,
 'jarak': 40,
 'bbrp': 100,
 'digembar': 2,
 'gembor': 16,
 'ebt': 50625,
 'eco': 68,
 'power': 1298,
 'booster': 10,
 'karya': 283,
 'mahasiswa': 521,
 'universitas': 232,
 'nasional': 2460,
 'kembang': 14357,
 'tawur': 2,
 'misi': 113,
 'jalan': 1480,
 'usaha': 3602,
 'gas': 1732,
 'integrasi': 285,
 'dasar': 392,
 'prinsip': 141,
 'komersial': 128,
 'kuat': 933,
 'seru': 146,
 'adik': 27,
 'sd': 101,
 'pangudi': 1,
 'luhur': 58,
 'solo': 28,
 'praktek': 25,
 'bsama': 1,
 'sharp': 6,
 'nuklir

In [8]:
n_document = len(twit)

def calc_IDF(__n_document, __DF):
    IDF_Dict = {}
    for term in __DF:
        IDF_Dict[term] = np.log(__n_document / (__DF[term] + 1))
    return IDF_Dict
  
#Stores the idf dictionary
IDF = calc_IDF(n_document, DF)

In [9]:
#calc TF-IDF
def calc_TF_IDF(TF):
    TF_IDF_Dict = {}
    #For each word in the review, we multiply its tf and its idf.
    for key in TF:
        TF_IDF_Dict[key] = TF[key] * IDF[key]
    return TF_IDF_Dict

#Stores the TF-IDF Series
twit["TF-IDF_dict"] = twit["TF_dict"].apply(calc_TF_IDF)

In [10]:
# Check TF-IDF result
index = len(twit)-1

print('%20s' % "term", "\t", '%10s' % "TF", "\t", '%20s' % "TF-IDF\n")
for key in twit["TF-IDF_dict"][index]:
    print('%20s' % key, "\t", twit["TF_dict"][index][key] ,"\t" , twit["TF-IDF_dict"][index][key])


                term 	         TF 	              TF-IDF

                pres 	 0.125 	 0.6848351299952332
               tarif 	 0.125 	 0.6270779683178164
             listrik 	 0.125 	 0.2729060737735284
                 ebt 	 0.125 	 0.09666620834698497
               nanti 	 0.125 	 1.009898712432486
             menteri 	 0.125 	 0.36073826307133655
               erick 	 0.125 	 0.6426202400482611
               teken 	 0.125 	 0.7088320035570579


In [11]:
# sort descending by value for DF dictionary 
sorted_DF = sorted(DF.items(), key=lambda kv: kv[1], reverse=True)[:50]

# Create a list of unique words from sorted dictionay `sorted_DF`
unique_term = [item[0] for item in sorted_DF]

def calc_TF_IDF_Vec(__TF_IDF_Dict):
    TF_IDF_vector = [0.0] * len(unique_term)

    # For each unique word, if it is in the review, store its TF-IDF value.
    for i, term in enumerate(unique_term):
        if term in __TF_IDF_Dict:
            TF_IDF_vector[i] = __TF_IDF_Dict[term]
    return TF_IDF_vector

twit["TF_IDF_Vec"] = twit["TF-IDF_dict"].apply(calc_TF_IDF_Vec)

print("print first row matrix TF_IDF_Vec Series\n")
print(twit["TF_IDF_Vec"][0])

print("\nmatrix size : ", len(twit["TF_IDF_Vec"][0]))

print first row matrix TF_IDF_Vec Series

[0.0, 0.06746782617927817, 0.08177705360907206, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

matrix size :  50


In [12]:
# Convert Series to List
TF_IDF_Vec_List = np.array(twit["TF_IDF_Vec"].to_list())

# Sum element vector in axis=0 
sums = TF_IDF_Vec_List.sum(axis=0)

data = []

for col, term in enumerate(unique_term):
    data.append((term, sums[col]))
    
ranking = pd.DataFrame(data, columns=['term', 'rank'])
ranking.sort_values('rank', ascending=False)

Unnamed: 0,term,rank
0,ebt,5457.923631
1,energi,5445.268782
2,baru,4936.265252
3,energy,3919.499714
4,renewable,3802.270494
5,kembang,3786.339779
6,indonesia,3013.389142
7,listrik,3003.017515
8,pln,2957.979364
9,perintah,2734.515805
