In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load only the tweet id and the stemmed-token column.
# The columns are renamed through an explicit mapping instead of assigning
# `twit.columns` positionally: `usecols` does not reorder the columns, so a
# positional assignment would mislabel them if the CSV stored the token column
# before the id column.
twit = pd.read_csv(
    'data/text-preprocessing.csv',
    usecols=["tweet_id", "tweet_tokens_stemmed"],
).rename(columns={"tweet_id": "label", "tweet_tokens_stemmed": "tweet"})
twit

Unnamed: 0,label,tweet
0,153619597713608704,"['lomba', 'poster', 'ilmiah', 'energi', 'baru'..."
1,153857491925610496,"['elaahhh', 'ngomong', 'energi', 'baru', 'tv',..."
2,154361424154603520,"['daerah', 'pencil', 'membutuhkam', 'listrik',..."
3,156397219346518017,"['tarik', 'materi', 'energi', 'baru', 'baek', ..."
4,156936339718279168,"['aneh', 'inget', 'jatropa', 'alias', 'minyak'..."
...,...,...
109700,1542534421219360772,"['bkpm', 'kembang', 'investasi', 'ebt', 'indon..."
109701,1542540419677859842,"['chemation', 'heat', 'exchanger', 'energi', '..."
109702,1542557674704740353,"['ebt', 'dadi', 'sumber', 'energi', 'listrik',..."
109703,1542592685978320896,"['seminar', 'nasional', 'himatikro', 'aktualis..."


In [3]:
# Show the raw "tweet" column; at this point each value is still a string
# (a stringified list of tokens), not a list.
print(twit['tweet'])

0         ['lomba', 'poster', 'ilmiah', 'energi', 'baru'...
1         ['elaahhh', 'ngomong', 'energi', 'baru', 'tv',...
2         ['daerah', 'pencil', 'membutuhkam', 'listrik',...
3         ['tarik', 'materi', 'energi', 'baru', 'baek', ...
4         ['aneh', 'inget', 'jatropa', 'alias', 'minyak'...
                                ...                        
109700    ['bkpm', 'kembang', 'investasi', 'ebt', 'indon...
109701    ['chemation', 'heat', 'exchanger', 'energi', '...
109702    ['ebt', 'dadi', 'sumber', 'energi', 'listrik',...
109703    ['seminar', 'nasional', 'himatikro', 'aktualis...
109704    ['pres', 'tarif', 'listrik', 'ebt', 'nanti', '...
Name: tweet, Length: 109705, dtype: object


In [4]:
# convert list-formatted string to list
import ast

def convert_text_list(texts):
    """Parse a stringified Python literal and return its items as a new list.

    `texts` is evaluated with `ast.literal_eval` and the resulting value is
    iterated into a fresh list.
    """
    return list(ast.literal_eval(texts))

# Parse every stringified token list in "tweet" into a real list of tokens.
twit["tweet_list"] = twit["tweet"].apply(convert_text_list)


print(twit["tweet_list"])

# NOTE(review): this prints the type of the whole column (a pandas Series),
# not the type of the individual parsed values.
print("\ntype : ", type(twit["tweet_list"]))

0         [lomba, poster, ilmiah, energi, baru, deadline...
1               [elaahhh, ngomong, energi, baru, tv, biruu]
2         [daerah, pencil, membutuhkam, listrik, manfaat...
3         [tarik, materi, energi, baru, baek, ganti, nam...
4         [aneh, inget, jatropa, alias, minyak, jarak, b...
                                ...                        
109700    [bkpm, kembang, investasi, ebt, indonesia, ham...
109701    [chemation, heat, exchanger, energi, manfaat, ...
109702          [ebt, dadi, sumber, energi, listrik, utama]
109703    [seminar, nasional, himatikro, aktualisasi, pe...
109704    [pres, tarif, listrik, ebt, nanti, menteri, er...
Name: tweet_list, Length: 109705, dtype: object

type :  <class 'pandas.core.series.Series'>


In [5]:
def calc_TF(document):
    """Return the relative frequency of every term in one document.

    Args:
        document: sequence of tokens (must support `len`).

    Returns:
        dict mapping each distinct term, in order of first occurrence, to
        (occurrences of the term) / (number of tokens in the document).
        An empty document yields an empty dict.
    """
    occurrences = {}
    for token in document:
        occurrences[token] = occurrences.get(token, 0) + 1

    n_tokens = len(document)
    return {token: count / n_tokens for token, count in occurrences.items()}

# One TF dictionary (term -> relative frequency) per tweet.
twit["TF_dict"] = twit['tweet_list'].apply(calc_TF)

# Preview the first ten dictionaries.
twit["TF_dict"].head(10)

0    {'lomba': 0.08333333333333333, 'poster': 0.083...
1    {'elaahhh': 0.16666666666666666, 'ngomong': 0....
2    {'daerah': 0.14285714285714285, 'pencil': 0.14...
3    {'tarik': 0.1111111111111111, 'materi': 0.1111...
4    {'aneh': 0.07692307692307693, 'inget': 0.07692...
5    {'eco': 0.1, 'power': 0.1, 'booster': 0.1, 'ka...
6    {'mahasiswa': 0.18181818181818182, 'tawur': 0....
7    {'misi': 0.07692307692307693, 'jalan': 0.07692...
8    {'seru': 0.09090909090909091, 'adik': 0.090909...
9    {'usaha': 0.125, 'nuklir': 0.125, 'salah': 0.1...
Name: TF_dict, dtype: object

In [6]:
# Inspect the TF values of the last tweet
index = len(twit)-1
last_tf = twit["TF_dict"][index]

print('%20s' % "term", "\t", "TF\n")
for key, tf_value in last_tf.items():
    print('%20s' % key, "\t", tf_value)

                term 	 TF

                pres 	 0.125
               tarif 	 0.125
             listrik 	 0.125
                 ebt 	 0.125
               nanti 	 0.125
             menteri 	 0.125
               erick 	 0.125
               teken 	 0.125


In [7]:
def calc_DF(tfDict):
    """Count, for every term, the number of documents that contain it.

    Args:
        tfDict: iterable of per-document collections of terms (here the
            per-tweet TF dictionaries, whose keys are the terms).

    Returns:
        dict mapping term -> number of per-document entries it was seen in.
    """
    count_DF = {}
    for doc_terms in tfDict:
        for term in doc_terms:
            count_DF[term] = count_DF.get(term, 0) + 1
    return count_DF

# Document frequency of every term over all tweets.
DF = calc_DF(twit["TF_dict"])

In [8]:
# NOTE(review): this displays the entire DF dictionary (one entry per distinct
# term in the corpus), which produces a very long cell output.
DF

{'lomba': 229,
 'poster': 41,
 'ilmiah': 53,
 'energi': 48821,
 'baru': 41118,
 'deadline': 29,
 'januari': 104,
 'untuk': 19,
 'siswa': 24,
 'sma': 101,
 'smk': 28,
 'derajat': 30,
 'elaahhh': 1,
 'ngomong': 76,
 'tv': 207,
 'biruu': 2,
 'daerah': 1106,
 'pencil': 241,
 'membutuhkam': 1,
 'listrik': 12360,
 'manfaat': 4794,
 'tarik': 883,
 'materi': 118,
 'baek': 6,
 'ganti': 922,
 'nama': 260,
 'propasif': 1,
 'ajah': 16,
 'aneh': 55,
 'inget': 56,
 'jatropa': 1,
 'alias': 76,
 'minyak': 1544,
 'jarak': 40,
 'bbrp': 100,
 'digembar': 2,
 'gembor': 16,
 'ebt': 50625,
 'eco': 68,
 'power': 1298,
 'booster': 10,
 'karya': 283,
 'mahasiswa': 521,
 'universitas': 232,
 'nasional': 2460,
 'kembang': 14357,
 'tawur': 2,
 'misi': 113,
 'jalan': 1480,
 'usaha': 3602,
 'gas': 1732,
 'integrasi': 285,
 'dasar': 392,
 'prinsip': 141,
 'komersial': 128,
 'kuat': 933,
 'seru': 146,
 'adik': 27,
 'sd': 101,
 'pangudi': 1,
 'luhur': 58,
 'solo': 28,
 'praktek': 25,
 'bsama': 1,
 'sharp': 6,
 'nuklir

In [9]:
# Total number of documents (tweets); used as the numerator of the IDF ratio.
n_document = len(twit)

def calc_IDF(__n_document, __DF):
    """Compute the inverse document frequency of every term.

    Args:
        __n_document: total number of documents.
        __DF: dict mapping term -> document frequency.

    Returns:
        dict mapping term -> log(__n_document / (document frequency + 1)),
        using the natural logarithm (`np.log`).
    """
    return {
        term: np.log(__n_document / (doc_freq + 1))
        for term, doc_freq in __DF.items()
    }
  
# Store the IDF dictionary (term -> IDF); calc_TF_IDF reads this global.
IDF = calc_IDF(n_document, DF)

In [10]:
#calc TF-IDF
def calc_TF_IDF(TF, idf=None):
    """Multiply each term's TF by its IDF.

    Args:
        TF: dict mapping term -> term frequency for one document.
        idf: optional dict mapping term -> IDF. When omitted, the
            notebook-level `IDF` dictionary is used, so existing
            `Series.apply(calc_TF_IDF)` calls behave as before.

    Returns:
        dict mapping term -> TF * IDF, in the key order of `TF`.

    Raises:
        KeyError: if a term of `TF` has no entry in the IDF table.
    """
    if idf is None:
        # Resolved at call time so the function follows the current global.
        idf = IDF
    return {term: tf_value * idf[term] for term, tf_value in TF.items()}

# Store one TF-IDF dictionary (term -> TF * IDF) per tweet.
twit["TF-IDF_dict"] = twit["TF_dict"].apply(calc_TF_IDF)

In [11]:
# Inspect TF next to TF-IDF for the last tweet
index = len(twit)-1
tf_last = twit["TF_dict"][index]
tfidf_last = twit["TF-IDF_dict"][index]

print('%20s' % "term", "\t", '%10s' % "TF", "\t", '%20s' % "TF-IDF\n")
for key, tfidf_value in tfidf_last.items():
    print('%20s' % key, "\t", tf_last[key] ,"\t" , tfidf_value)


                term 	         TF 	              TF-IDF

                pres 	 0.125 	 0.6848351299952332
               tarif 	 0.125 	 0.6270779683178164
             listrik 	 0.125 	 0.2729060737735284
                 ebt 	 0.125 	 0.09666620834698497
               nanti 	 0.125 	 1.009898712432486
             menteri 	 0.125 	 0.36073826307133655
               erick 	 0.125 	 0.6426202400482611
               teken 	 0.125 	 0.7088320035570579


In [12]:
# Sort the (term, document frequency) pairs by frequency, highest first, and
# keep only the first 50.
sorted_DF = sorted(DF.items(), key=lambda kv: kv[1], reverse=True)[:50]

# Vocabulary for the TF-IDF vectors: the terms of `sorted_DF`, in the same
# order. Position i of every vector corresponds to unique_term[i].
unique_term = [item[0] for item in sorted_DF]

def calc_TF_IDF_Vec(__TF_IDF_Dict, terms=None):
    """Turn one TF-IDF dictionary into a fixed-length vector.

    Args:
        __TF_IDF_Dict: dict mapping term -> TF-IDF value for one document.
        terms: optional ordered list of vocabulary terms. When omitted, the
            notebook-level `unique_term` list is used, so existing
            `Series.apply(calc_TF_IDF_Vec)` calls behave as before.

    Returns:
        list with one entry per vocabulary term: the document's TF-IDF value
        for that term, or 0.0 when the term is absent from the document.
    """
    if terms is None:
        # Resolved at call time so the function follows the current global.
        terms = unique_term
    return [__TF_IDF_Dict.get(term, 0.0) for term in terms]

# One fixed-length TF-IDF vector per tweet, aligned with `unique_term`.
twit["TF_IDF_Vec"] = twit["TF-IDF_dict"].apply(calc_TF_IDF_Vec)

print("print first row matrix TF_IDF_Vec Series\n")
print(twit["TF_IDF_Vec"][0])

# The "size" printed here is the length of the first vector.
print("\nmatrix size : ", len(twit["TF_IDF_Vec"][0]))

print first row matrix TF_IDF_Vec Series

[0.0, 0.06746782617927817, 0.08177705360907206, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

matrix size :  50


In [13]:
# Stack the per-tweet vectors into one 2-D array (one row per tweet)
TF_IDF_Vec_List = np.array(twit["TF_IDF_Vec"].to_list())

# Column totals: summed TF-IDF of each vocabulary term over all tweets
sums = TF_IDF_Vec_List.sum(axis=0)

# Pair every vocabulary term with its total
data = list(zip(unique_term, sums))

ranking = pd.DataFrame(data, columns=['term', 'rank'])
ranking.sort_values('rank', ascending=False)

Unnamed: 0,term,rank
0,ebt,5457.923631
1,energi,5445.268782
2,baru,4936.265252
3,energy,3919.499714
4,renewable,3802.270494
5,kembang,3786.339779
6,indonesia,3013.389142
7,listrik,3003.017515
8,pln,2957.979364
9,perintah,2734.515805


In [14]:
# join list of token as single document string
import ast

def join_text_list(texts):
    """Parse a stringified list of tokens and join them with single spaces.

    `texts` is evaluated with `ast.literal_eval`; the items of the result are
    concatenated into one space-separated string.
    """
    tokens = ast.literal_eval(texts)
    return ' '.join(tokens)
# One space-separated document string per tweet; this is the input passed to
# the scikit-learn vectorizers in the following cells.
twit["tweet_join"] = twit["tweet"].apply(join_text_list)

twit["tweet_join"].head()

0    lomba poster ilmiah energi baru deadline janua...
1                 elaahhh ngomong energi baru tv biruu
2    daerah pencil membutuhkam listrik manfaat ener...
3    tarik materi energi baru baek ganti nama propa...
4    aneh inget jatropa alias minyak jarak bbrp jat...
Name: tweet_join, dtype: object

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Number of terms to keep: the top `max_features` terms,
# ordered by term frequency across the whole corpus.
max_features = 1000

# Feature Engineering 
print ("------- TF-IDF on Tweet data -------")

# NOTE(review): binary=True is passed to the vectorizer here; confirm against
# the TfidfVectorizer documentation that this is the intended TF weighting.
tf_idf = TfidfVectorizer(max_features=max_features, binary=True)
# fit_transform returns a sparse matrix; .toarray() makes it a dense ndarray.
tfidf_mat = tf_idf.fit_transform(twit["tweet_join"]).toarray()

print("TF-IDF ", type(tfidf_mat), tfidf_mat.shape)

------- TF-IDF on Tweet data -------
TF-IDF  <class 'numpy.ndarray'> (109705, 1000)


In [16]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases;
# `.tolist()` keeps `terms` a plain list of str as before.
if hasattr(tf_idf, "get_feature_names_out"):
    terms = tf_idf.get_feature_names_out().tolist()
else:
    terms = tf_idf.get_feature_names()

# Sum the TF-IDF weight of each term over all documents
sums = tfidf_mat.sum(axis=0)

# Pair each term with its summed weight
data = []
for col, term in enumerate(terms):
    data.append((term, sums[col]))

ranking = pd.DataFrame(data, columns=['term','rank'])
ranking.sort_values('rank', ascending=False)



Unnamed: 0,term,rank
226,ebt,9950.657169
248,energi,7372.402961
94,baru,6959.469129
249,energy,5223.962157
768,renewable,5102.922560
...,...,...
728,present,51.120963
462,komersial,50.853984
408,kab,50.216492
349,indikasi,49.241136


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize

# Manual TF-IDF: L1-normalised term counts multiplied by scikit-learn's IDF.
max_features = 1000

# calc TF vector
cvect = CountVectorizer(max_features=max_features)
TF_vector = cvect.fit_transform(twit["tweet_join"])

# normalize TF vector: each row is divided by its L1 norm, i.e. by the number
# of counted tokens that belong to the retained vocabulary
normalized_TF_vector = normalize(TF_vector, norm='l1', axis=1)

# calc IDF
# NOTE(review): the column-wise multiplication below assumes this vectorizer
# learns the same vocabulary, in the same column order, as `cvect` — confirm.
tfidf = TfidfVectorizer(max_features=max_features, smooth_idf=False)
tfs = tfidf.fit_transform(twit["tweet_join"])
IDF_vector = tfidf.idf_

# compute TF x IDF to obtain the TF-IDF matrix / vector
tfidf_mat = normalized_TF_vector.multiply(IDF_vector).toarray()

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize

# Same manual TF-IDF as the unigram version, over unigrams, bigrams and
# trigrams together.
max_features = 1000

# ngram_range (1, 3) to use unigram, bigram, trigram
cvect = CountVectorizer(max_features=max_features, ngram_range=(1,3))
counts = cvect.fit_transform(twit["tweet_join"])

# Row-wise L1 normalisation of the counts of the retained n-grams.
normalized_counts = normalize(counts, norm='l1', axis=1)

# Fitted only to obtain `idf_`; `tfs` is not used below.
# NOTE(review): assumes the same vocabulary/column order as `cvect` — confirm.
tfidf = TfidfVectorizer(max_features=max_features, ngram_range=(1,3), smooth_idf=False)
tfs = tfidf.fit_transform(twit["tweet_join"])

tfidf_mat = normalized_counts.multiply(tfidf.idf_).toarray()

In [19]:
# `get_feature_names()` was removed in scikit-learn 1.2; use
# `get_feature_names_out()` when available and keep a plain list for display.
(
    tfidf.get_feature_names_out().tolist()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)



['acara',
 'ada',
 'ahli',
 'air',
 'ajak',
 'ajar',
 'akademisi',
 'akn',
 'akselerasi',
 'aku',
 'alam',
 'alas',
 'alih',
 'alih ebt',
 'alih energi',
 'alih energi baru',
 'alternatif',
 'amat',
 'ambil',
 'ana',
 'anak',
 'anak usaha',
 'ancam',
 'andal',
 'andal energi',
 'andal energi baru',
 'aneka',
 'aneka energi',
 'aneka energi baru',
 'anggap',
 'anggar',
 'anggota',
 'angin',
 'apec',
 'apresiasi',
 'arah',
 'arifin',
 'ars',
 'asal',
 'asia',
 'atap',
 'atas',
 'atur',
 'ayo',
 'baca',
 'badan',
 'badan usaha',
 'bagus',
 'bahan',
 'bahan bakar',
 'bahan bakar fosil',
 'bahas',
 'bahas energi',
 'bahas energi baru',
 'baik',
 'bakar',
 'bakar fosil',
 'baku',
 'bal',
 'banding',
 'bang',
 'banget',
 'bangkit',
 'bangkit ebt',
 'bangkit energi',
 'bangkit energi baru',
 'bangkit listrik',
 'bangkit listrik bas',
 'bangkit listrik ebt',
 'bangkit listrik energi',
 'bangkit listrik tenaga',
 'bangsa',
 'bangun',
 'bangun bangkit',
 'bangun bangkit ebt',
 'bangun bangkit lis

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import normalize

max_features = 1000


def generate_tfidf_mat(min_gram, max_gram):
    """Build TF, IDF and TF-IDF for n-grams of twit["tweet_join"].

    Reads the notebook-level `twit` frame and `max_features` constant.

    Args:
        min_gram: lower bound of the n-gram range.
        max_gram: upper bound of the n-gram range.

    Returns:
        Tuple `(TF, IDF, TF_IDF, feature_names)`:
        - TF: dense array of row-wise L1-normalised n-gram counts. Only
          n-grams kept in the `max_features` vocabulary are counted, so each
          row is normalised over its retained n-grams.
        - IDF: the `idf_` vector learned by `TfidfVectorizer(smooth_idf=False)`.
        - TF_IDF: dense array, TF multiplied column-wise by IDF.
        - feature_names: list of n-gram strings from the TF-IDF vectorizer.

    NOTE(review): the column-wise product assumes both vectorizers learn the
    same vocabulary in the same column order (they receive the same
    `max_features` and `ngram_range`) — confirm.
    """
    ngram_range = (min_gram, max_gram)

    cvect = CountVectorizer(max_features=max_features, ngram_range=ngram_range)
    counts = cvect.fit_transform(twit["tweet_join"])
    normalized_counts = normalize(counts, norm='l1', axis=1)

    # Only the learned IDF weights are needed, so `fit` is enough; the
    # transformed matrix was never used.
    tfidf = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range, smooth_idf=False)
    tfidf.fit(twit["tweet_join"])

    # `get_feature_names()` was removed in scikit-learn 1.2. `.tolist()` keeps
    # the return value a plain list of str, which stays JSON-serialisable.
    if hasattr(tfidf, "get_feature_names_out"):
        feature_names = tfidf.get_feature_names_out().tolist()
    else:
        feature_names = tfidf.get_feature_names()

    TF = normalized_counts.toarray()
    IDF = tfidf.idf_
    TF_IDF = normalized_counts.multiply(IDF).toarray()
    return TF, IDF, TF_IDF, feature_names

# Each call returns (TF matrix, IDF vector, TF-IDF matrix, feature names).

# ngram_range (1, 1) to use unigram only
tf_mat_unigram, idf_mat_unigram, tfidf_mat_unigram, terms_unigram = generate_tfidf_mat(1,1)

# ngram_range (2, 2) to use bigram only
tf_mat_bigram, idf_mat_bigram, tfidf_mat_bigram, terms_bigram = generate_tfidf_mat(2,2)

# ngram_range (3, 3) to use trigram only
tf_mat_trigram, idf_mat_trigram, tfidf_mat_trigram, terms_trigram = generate_tfidf_mat(3,3)

# ---------- inspect the non-zero unigram entries of one tweet ----------
idx_sample = 0

print("Show TFIDF sample ke-" + str(idx_sample), "\n")
print(twit["tweet"][idx_sample], "\n")

print("\t\t\t", "TF", "\t\t", "IDF", "\t\t", "TF-IDF", "\t", "Term\n")
sample_columns = zip(
    tf_mat_unigram[idx_sample],
    idf_mat_unigram,
    tfidf_mat_unigram[idx_sample],
    terms_unigram,
)
for i, (tf_value, idf_value, tfidf_value, term_name) in enumerate(sample_columns):
    # Skip vocabulary terms that do not occur in this tweet.
    if tfidf_value == 0.0:
        continue
    print("array position " + str(i) + "\t",
          "%.6f" % tf_value, "\t",
          "%.6f" % idf_value, "\t",
          "%.6f" % tfidf_value, "\t",
          term_name)

Show TFIDF sample ke-0 

['lomba', 'poster', 'ilmiah', 'energi', 'baru', 'deadline', 'januari', 'untuk', 'siswa', 'sma', 'smk', 'derajat'] 

			 TF 		 IDF 		 TF-IDF 	 Term

array position 95	 0.333333 	 1.981300 	 0.660433 	 baru
array position 246	 0.333333 	 1.809532 	 0.603177 	 energi
array position 523	 0.333333 	 7.171828 	 2.390609 	 lomba


In [21]:
def get_TF_unigram(row):
    """Return the non-zero unigram TF values of the tweet in `row`.

    `row.name` (the row's index label) is used as the row position in
    `tf_mat_unigram`; this presumably relies on `twit` keeping its default
    0..n-1 index — verify if the frame is ever re-indexed.
    """
    doc_tf = tf_mat_unigram[row.name]
    return [value for value in doc_tf if value != 0.0]

twit["TF_UNIGRAM"] = twit.apply(get_TF_unigram, axis=1)

def get_IDF_unigram(row):
    """Return the IDF of every unigram whose TF is non-zero in `row`'s tweet.

    `row.name` is used as the row position in `tf_mat_unigram`.
    """
    doc_tf = tf_mat_unigram[row.name]
    return [idf for tf, idf in zip(doc_tf, idf_mat_unigram) if tf != 0.0]

twit["IDF_UNIGRAM"] = twit.apply(get_IDF_unigram, axis=1)

def get_TFIDF_unigram(row):
    """Return the non-zero unigram TF-IDF values of the tweet in `row`.

    `row.name` is used as the row position in `tfidf_mat_unigram`.
    """
    doc_tfidf = tfidf_mat_unigram[row.name]
    return [value for value in doc_tfidf if value != 0.0]

twit["TFIDF_UNIGRAM"] = twit.apply(get_TFIDF_unigram, axis=1)

# NOTE(review): this preview is not the last expression of the cell, so its
# result is presumably not displayed when the cell runs.
twit[["tweet", "TF_UNIGRAM", "IDF_UNIGRAM", "TFIDF_UNIGRAM"]].head()

# save TFIDF Unigram to Excel

twit[["tweet", "TF_UNIGRAM", "IDF_UNIGRAM", "TFIDF_UNIGRAM"]].to_excel("TFIDF_Unigram.xlsx")

In [22]:
def get_TF_bigram(row):
    """Return the non-zero bigram TF values of the tweet in `row`.

    `row.name` (the row's index label) is used as the row position in
    `tf_mat_bigram`; this presumably relies on `twit` keeping its default
    0..n-1 index — verify if the frame is ever re-indexed.
    """
    doc_tf = tf_mat_bigram[row.name]
    return [value for value in doc_tf if value != 0.0]

twit["TF_BIGRAM"] = twit.apply(get_TF_bigram, axis=1)

def get_IDF_bigram(row):
    """Return the IDF of every bigram whose TF is non-zero in `row`'s tweet.

    `row.name` is used as the row position in `tf_mat_bigram`.
    """
    doc_tf = tf_mat_bigram[row.name]
    return [idf for tf, idf in zip(doc_tf, idf_mat_bigram) if tf != 0.0]

twit["IDF_BIGRAM"] = twit.apply(get_IDF_bigram, axis=1)

def get_TFIDF_bigram(row):
    """Return the non-zero bigram TF-IDF values of the tweet in `row`.

    `row.name` is used as the row position in `tfidf_mat_bigram`.
    """
    doc_tfidf = tfidf_mat_bigram[row.name]
    return [value for value in doc_tfidf if value != 0.0]

twit["TFIDF_BIGRAM"] = twit.apply(get_TFIDF_bigram, axis=1)

def get_Term_bigram(row):
    """Return the bigrams whose TF is non-zero in `row`'s tweet.

    `row.name` is used as the row position in `tf_mat_bigram`; the bigram
    strings come from `terms_bigram`.
    """
    doc_tf = tf_mat_bigram[row.name]
    return [term for tf, term in zip(doc_tf, terms_bigram) if tf != 0.0]

twit["TWEET_BIGRAM"] = twit.apply(get_Term_bigram, axis=1)

# NOTE(review): this preview is not the last expression of the cell, so its
# result is presumably not displayed when the cell runs.
twit[["TWEET_BIGRAM", "TF_BIGRAM", "IDF_BIGRAM", "TFIDF_BIGRAM"]].head()


# save TFIDF Bigram to Excel

twit[["TWEET_BIGRAM", "TF_BIGRAM", "IDF_BIGRAM", "TFIDF_BIGRAM"]].to_excel("TFIDF_Bigram.xlsx")

In [23]:
def get_TF_trigram(row):
    """Return the non-zero trigram TF values of the tweet in `row`.

    `row.name` (the row's index label) is used as the row position in
    `tf_mat_trigram`; this presumably relies on `twit` keeping its default
    0..n-1 index — verify if the frame is ever re-indexed.
    """
    doc_tf = tf_mat_trigram[row.name]
    return [value for value in doc_tf if value != 0.0]

twit["TF_trigram"] = twit.apply(get_TF_trigram, axis=1)

def get_IDF_trigram(row):
    """Return the IDF of every trigram whose TF is non-zero in `row`'s tweet.

    `row.name` is used as the row position in `tf_mat_trigram`.
    """
    doc_tf = tf_mat_trigram[row.name]
    return [idf for tf, idf in zip(doc_tf, idf_mat_trigram) if tf != 0.0]

twit["IDF_trigram"] = twit.apply(get_IDF_trigram, axis=1)

def get_TFIDF_trigram(row):
    """Return the non-zero trigram TF-IDF values of the tweet in `row`.

    `row.name` is used as the row position in `tfidf_mat_trigram`.
    """
    doc_tfidf = tfidf_mat_trigram[row.name]
    return [value for value in doc_tfidf if value != 0.0]

twit["TFIDF_trigram"] = twit.apply(get_TFIDF_trigram, axis=1)

def get_Term_trigram(row):
    """Return the trigrams whose TF is non-zero in `row`'s tweet.

    `row.name` is used as the row position in `tf_mat_trigram`; the trigram
    strings come from `terms_trigram`.
    """
    doc_tf = tf_mat_trigram[row.name]
    return [term for tf, term in zip(doc_tf, terms_trigram) if tf != 0.0]

twit["TWEET_TRIGRAM"] = twit.apply(get_Term_trigram, axis=1)

# NOTE(review): this preview is not the last expression of the cell, so its
# result is presumably not displayed when the cell runs.
twit[["TWEET_TRIGRAM", "TF_trigram", "IDF_trigram", "TFIDF_trigram"]].head()


# save TFIDF Trigram to Excel

twit[["TWEET_TRIGRAM", "TF_trigram", "IDF_trigram", "TFIDF_trigram"]].to_excel("TFIDF_Trigram.xlsx")

In [24]:
import json

def get_dict_feature_name(terms):
    """Wrap `terms` in a dict under the single key 'feature'."""
    return {'feature': terms}


def save_to_json(Data, json_filename):
    """Serialise `Data` as JSON into the file `json_filename`.

    The file is opened in write mode, so an existing file is overwritten.
    """
    with open(json_filename, mode='w') as out_file:
        json.dump(Data, out_file)
        
# Save the unigram, bigram and trigram feature names as JSON files, each
# wrapped as {"feature": [...]}.
save_to_json( get_dict_feature_name(terms_unigram),"tfidf_feature_name_unigram.json")
save_to_json( get_dict_feature_name(terms_bigram),"tfidf_feature_name_bigram.json")
save_to_json( get_dict_feature_name(terms_trigram),"tfidf_feature_name_trigram.json")

In [25]:
# Export the full frame, including every column added above, to Excel.
twit.to_excel("TFIDF_result.xlsx")