In [1]:
### test
import sqlite3
import pandas as pd

pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
con = sqlite3.connect("dbnews.db")

In [3]:
df = pd.read_sql_query("SELECT comment FROM t_comments", con)

In [4]:
df['comment']

0         WKWKWK, GAK SEKALIAN PAK, SELAWAT KE DPR... KA...
1         Mantab.. ini br dakwah yg sejati.. kl d tempat...
2                                     Salut buat Gus Miftah
3         yg nyinyir gak pernah lihat dan baca atw menge...
4         Ada adabnya sholawat. Lebih baik ajak ke majel...
5         Setiap ulama punya jalan dan cara dakwah masin...
6                                   Umpanin nocannya yak...
7                            Itulah ceramah yg sebenarnya. 
8                                   Besok bugil aja selawat
9         Bedanya Gus Miftah dengan FPI  / HTI bagaikan ...
10        Knp di tempat kerjanya? Mending diundang ke ma...
11        nah ini ulama beneran...diterima di semua kala...
12        Kerennn.... saya pernah dpt info seperti ini d...
13        Hebat Gus.Lanjutkan!inilah makna dakwah sebena...
14              Diskotik syariah jadunya nih wkwkwk..mantap
15        Niat dan tujuan yg baik pasti mendapat berkah ...
16        mantaaapppp..buat di gedung DP

## Preprocessing

In [5]:
import re

In [6]:
#drop duplicate data
data=df.drop_duplicates()
#mengubah ke huruf kecil
data['comment']=data['comment'].str.lower() 
#remove non aplabet
p = re.compile('[^a-zA-Z]')
data['comment'] = [p.sub(' ', x) for x in data['comment']]

In [7]:
data

Unnamed: 0,comment
0,wkwkwk gak sekalian pak selawat ke dpr ka...
1,mantab ini br dakwah yg sejati kl d tempat...
2,salut buat gus miftah
3,yg nyinyir gak pernah lihat dan baca atw menge...
4,ada adabnya sholawat lebih baik ajak ke majel...
5,setiap ulama punya jalan dan cara dakwah masin...
6,umpanin nocannya yak
7,itulah ceramah yg sebenarnya
8,besok bugil aja selawat
9,bedanya gus miftah dengan fpi hti bagaikan ...


## Tokenisasi

In [8]:
from nltk.tokenize import word_tokenize 

In [9]:
data['comment']=data['comment'].apply(lambda x: word_tokenize(x))

In [10]:
data['comment']

0         [wkwkwk, gak, sekalian, pak, selawat, ke, dpr,...
1         [mantab, ini, br, dakwah, yg, sejati, kl, d, t...
2                                [salut, buat, gus, miftah]
3         [yg, nyinyir, gak, pernah, lihat, dan, baca, a...
4         [ada, adabnya, sholawat, lebih, baik, ajak, ke...
5         [setiap, ulama, punya, jalan, dan, cara, dakwa...
6                                  [umpanin, nocannya, yak]
7                         [itulah, ceramah, yg, sebenarnya]
8                              [besok, bugil, aja, selawat]
9         [bedanya, gus, miftah, dengan, fpi, hti, bagai...
10        [knp, di, tempat, kerjanya, mending, diundang,...
11        [nah, ini, ulama, beneran, diterima, di, semua...
12        [kerennn, saya, pernah, dpt, info, seperti, in...
13        [hebat, gus, lanjutkan, inilah, makna, dakwah,...
14        [diskotik, syariah, jadunya, nih, wkwkwk, mantap]
15        [niat, dan, tujuan, yg, baik, pasti, mendapat,...
16        [mantaaapppp, buat, di, gedung

## Stemming

In [11]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [12]:
db = StemmerFactory()
stemmer = db.create_stemmer()

data['comment']=data['comment'].apply(lambda x : [stemmer.stem(y) for y in x])

In [13]:
data.head()

Unnamed: 0,comment
0,"[wkwkwk, gak, sekali, pak, selawat, ke, dpr, k..."
1,"[mantab, ini, br, dakwah, yg, sejati, kl, d, t..."
2,"[salut, buat, gus, miftah]"
3,"[yg, nyinyir, gak, pernah, lihat, dan, baca, a..."
4,"[ada, adab, sholawat, lebih, baik, ajak, ke, m..."


## Stopword Removal

In [None]:
from nltk.corpus import stopwords

In [None]:
list_stopwords = stopwords.words('indonesian')

In [None]:
list_stopwords.extend(["yg", "dg", "dgn", "ny", "d", "dh", 'atw', 'klo', 
                       'kalo', 'kl', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'g', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 'dr',
                       'jd', 'jgn','sdh', 'aja', 'ya', 'n', 't', 'nggak',
                       'hehe', 'wkwkwk', 'pen', 'u', 'nan', 'loh'])

In [None]:
list_stopwords = set(list_stopwords)

In [None]:
def stopwords_removal(words):
    return [word for word in words if not word in list_stopwords]

data['comment'] = data['comment'].apply(stopwords_removal)

In [None]:
data['comment'].head(10)

## Normalisation

In [14]:
slang=pd.read_csv("colloquial-indonesian-lexicon.csv")

In [15]:
slang=slang.iloc[:,0:2]

In [16]:
slang

Unnamed: 0,slang,formal
0,woww,wow
1,aminn,amin
2,met,selamat
3,netaas,menetas
4,keberpa,keberapa
5,eeeehhhh,eh
6,kata2nyaaa,kata-katanya
7,hallo,halo
8,kaka,kakak
9,ka,kak


In [17]:
normalizad_word_dict = {}

for index, row in slang.iterrows():
    if row[0] not in normalizad_word_dict:
        normalizad_word_dict[row[0]] = row[1] 

def normalized_term(document):
    return [normalizad_word_dict[term] if term in normalizad_word_dict else term for term in document]

data['comment'] = data['comment'].apply(normalized_term)

In [18]:
data.head(100)

Unnamed: 0,comment
0,"[wkwkwk, enggak, sekali, pak, selawat, ke, dpr..."
1,"[mantab, ini, baru, dakwah, yang, sejati, kala..."
2,"[salut, buat, gus, miftah]"
3,"[yang, nyinyir, enggak, pernah, lihat, dan, ba..."
4,"[ada, adab, sholawat, lebih, baik, ajak, ke, m..."
5,"[tiap, ulama, punya, jalan, dan, cara, dakwah,..."
6,"[umpanin, nocannya, ya]"
7,"[itu, ceramah, yang, benar]"
8,"[besok, bugil, saja, selawat]"
9,"[beda, gus, miftah, dengan, fpi, hti, bagai, s..."


## Build model

In [19]:
import multiprocessing
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec

wiki = WikiCorpus('idwiki-latest-pages-articles.xml.bz2', lemmatize=False, dictionary={})
sentences = list(wiki.get_texts())
params = {'size': 200, 'window': 10, 'min_count': 10, 
          'workers': max(1, multiprocessing.cpu_count() - 1), 'sample': 1E-3,}
word2vec = Word2Vec(sentences, **params)



In [29]:
Word2Vec("Sy")

RuntimeError: you must first build vocabulary before training the model

In [25]:
threshold = 0.6 # define a threshold
similar_words = {word:{} for word in data} # create a dictionary from root_words
for word in data:
    """        
    Loop over all words in root_words and create
    a temp dict with values above the threshold using the
    model.most_similar() method
    """
    temp_dict = dict(word2vec.most_similar(word))
    temp_dict = {k:v for k,v in temp_dict.items() if v > threshold}
    # append the temp dict to the similar_words dict
    similar_words[word] = temp_dict


def replace_words(text):
    """
    1.Loop over every word in the text (text is one row in the data 
    2.If the word is in root_words, simply append it to temp_text
    3.If not, then loop over all words in the similar_words dict, and
    check if the current word is in one of the sub dictionaries - if so,
    append the root_word to the temp_text
    4. Use the flags so we don't miss out on any words (e.g. there
    may be words that are not in the root_words list or in the
    similar_words sub dictionaries
    5. Return temp_text
    """
    temp_text = []
    for word in text:
        in_root_words_flag = False
        found_root_flag = False

        if word in root_words:
            temp_text.append(word)
            in_root_words_flag = True

        else:
            for root_word in similar_words:
                if word in similar_words[root_word]:
                    temp_text.append(root_word)
                    found_root_flag = True

        if in_root_words_flag == False and found_root_flag == False:
            temp_text.append(word)

    return temp_text

# apply the function above to your text data and create a new column
data['replaced_words'] = data["Adj_Addr"].get.apply(replace_words)

  if __name__ == '__main__':


KeyError: 'Adj_Addr'