In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import fasttext
import contractions
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
# Notebook-wide display configuration.
# NOTE(review): plt.xticks acts on the current axes at call time; no plot has been drawn yet
# at this point, so this rotation may not carry over to later figures — confirm it is needed.
plt.xticks(rotation=70)
# Silence pandas' chained-assignment (SettingWithCopy) warning for the column assignments below.
pd.options.mode.chained_assignment = None
# Show up to 100 characters per cell when frames are displayed.
pd.set_option('display.max_colwidth', 100)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Load the raw word list as UTF-8 text.
# The `with` block closes the file handle on exit, so no explicit f.close() is needed afterwards.
with open('kamus_useless.csv', encoding="utf8") as f:
    df = pd.read_csv(f)

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,mix
1,1,dulatep
2,2,rosidi
3,3,syafri
4,4,hiro


In [4]:
# Report, column by column, how many entries are missing.
for column_name in df.columns:
    missing_count = df[column_name].isnull().sum()
    print(column_name, missing_count)

Unnamed: 0 0
0 2


In [5]:
# Work on a one-column frame holding only the text column, which is named '0' in the CSV.
rws = df[['0']]

In [6]:
# Force every entry of the text column to a Python string so the string operations in the
# following cells can be applied uniformly.
# NOTE(review): missing values are turned into the literal text 'nan' by this cast and are
# then processed like any other word — confirm that is intended.
text_as_str = rws['0'].astype(str)
rws['0'] = text_as_str
rws['0'].head()

0        mix
1    dulatep
2     rosidi
3     syafri
4       hiro
Name: 0, dtype: object

In [7]:
def _expand_contractions(text):
    """Split `text` on whitespace and run contractions.fix on each piece."""
    return [contractions.fix(piece) for piece in text.split()]


# One list of (possibly expanded) pieces per row.
rws['no_contract'] = rws['0'].apply(_expand_contractions)
rws.head()

Unnamed: 0,0,no_contract
0,mix,[mix]
1,dulatep,[dulatep]
2,rosidi,[rosidi]
3,syafri,[syafri]
4,hiro,[hiro]


In [8]:
# Glue each row's pieces back into a single space-separated string.
rebuilt_text = []
for pieces in rws['no_contract']:
    rebuilt_text.append(' '.join(str(piece) for piece in pieces))
rws['fix_str'] = rebuilt_text
rws.head()

Unnamed: 0,0,no_contract,fix_str
0,mix,[mix],mix
1,dulatep,[dulatep],dulatep
2,rosidi,[rosidi],rosidi
3,syafri,[syafri],syafri
4,hiro,[hiro],hiro


In [9]:
# Detect the language of every row with fastText's pretrained language-identification model.
pretrained_model = "lid.176.bin"
model = fasttext.load_model(pretrained_model)

# model.predict returns (labels, probabilities); each label looks like '__label__en'.
# Strip the prefix from the label itself instead of slicing the repr of the tuple at fixed
# offsets, which silently cut three-letter language codes (e.g. 'ceb') down to two letters.
label_prefix = "__label__"
langs = []
for sent in rws['fix_str']:
    labels, _scores = model.predict(sent)
    lang = labels[0]  # top-1 prediction
    if lang.startswith(label_prefix):
        lang = lang[len(label_prefix):]
    langs.append(lang)
rws['langs'] = langs



In [10]:
# Inspect the first rows together with the newly added 'langs' column.
rws.head()

Unnamed: 0,0,no_contract,fix_str,langs
0,mix,[mix],mix,pl
1,dulatep,[dulatep],dulatep,ru
2,rosidi,[rosidi],rosidi,it
3,syafri,[syafri],syafri,ms
4,hiro,[hiro],hiro,nl


In [11]:
# Split every rebuilt string into tokens with NLTK's word_tokenize.
tokenized_rows = rws['fix_str'].apply(word_tokenize)
rws['tokenized'] = tokenized_rows
rws.head()

Unnamed: 0,0,no_contract,fix_str,langs,tokenized
0,mix,[mix],mix,pl,[mix]
1,dulatep,[dulatep],dulatep,ru,[dulatep]
2,rosidi,[rosidi],rosidi,it,[rosidi]
3,syafri,[syafri],syafri,ms,[syafri]
4,hiro,[hiro],hiro,nl,[hiro]


In [12]:
def _lowercase_tokens(tokens):
    """Return a new list with every token lower-cased."""
    return [token.lower() for token in tokens]


rws['lower'] = rws['tokenized'].apply(_lowercase_tokens)
rws.head()

Unnamed: 0,0,no_contract,fix_str,langs,tokenized,lower
0,mix,[mix],mix,pl,[mix],[mix]
1,dulatep,[dulatep],dulatep,ru,[dulatep],[dulatep]
2,rosidi,[rosidi],rosidi,it,[rosidi],[rosidi]
3,syafri,[syafri],syafri,ms,[syafri],[syafri]
4,hiro,[hiro],hiro,nl,[hiro],[hiro]


In [13]:
punc = string.punctuation
punc_chars = set(punc)


def _is_punctuation(token):
    """Return True when every character of `token` is an ASCII punctuation character.

    Testing `token in punc` against the punctuation *string* is a substring check, so
    multi-character punctuation tokens such as '...', '``' or "''" were kept. Checking
    character by character against a set removes those as well.
    """
    return all(ch in punc_chars for ch in token)


# Drop tokens that consist solely of punctuation.
rws['no_punc'] = rws['lower'].apply(lambda x: [word for word in x if not _is_punctuation(word)])
rws.head()

Unnamed: 0,0,no_contract,fix_str,langs,tokenized,lower,no_punc
0,mix,[mix],mix,pl,[mix],[mix],[mix]
1,dulatep,[dulatep],dulatep,ru,[dulatep],[dulatep],[dulatep]
2,rosidi,[rosidi],rosidi,it,[rosidi],[rosidi],[rosidi]
3,syafri,[syafri],syafri,ms,[syafri],[syafri],[syafri]
4,hiro,[hiro],hiro,nl,[hiro],[hiro],[hiro]


In [14]:
# Fetch the NLTK resources used by the following cells. The value of the last call is what
# the cell displays as its output.
# NOTE(review): word_tokenize is called in an earlier cell and its tokenizer data is not
# downloaded here — confirm it is already installed for a fresh environment.
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')
nltk.download('averaged_perceptron_tagger')  # presumably the model behind nltk.tag.pos_tag — confirm for the installed NLTK version
nltk.download('wordnet')  # WordNet data for WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\didin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\didin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\didin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def _drop_stopwords(tokens):
    """Return the tokens that are not English stop words, preserving order."""
    kept = []
    for token in tokens:
        if token not in stop_words:
            kept.append(token)
    return kept


rws['stopwords_removed'] = rws['no_punc'].apply(_drop_stopwords)
rws.head()

Unnamed: 0,0,no_contract,fix_str,langs,tokenized,lower,no_punc,stopwords_removed
0,mix,[mix],mix,pl,[mix],[mix],[mix],[mix]
1,dulatep,[dulatep],dulatep,ru,[dulatep],[dulatep],[dulatep],[dulatep]
2,rosidi,[rosidi],rosidi,it,[rosidi],[rosidi],[rosidi],[rosidi]
3,syafri,[syafri],syafri,ms,[syafri],[syafri],[syafri],[syafri]
4,hiro,[hiro],hiro,nl,[hiro],[hiro],[hiro],[hiro]


In [16]:
# Tag all rows in one batch. Calling nltk.tag.pos_tag once per row sets up the tagger again
# on every call, which made this cell slow enough to be interrupted; pos_tag_sents sets the
# tagger up once and applies it to each token list, producing the same (word, tag) pairs.
tagged_rows = nltk.tag.pos_tag_sents(rws['stopwords_removed'].tolist())
# Wrap in an object Series on the frame's own index so each cell holds one list of pairs.
rws['pos_tags'] = pd.Series(tagged_rows, index=rws.index, dtype=object)
rws.head()

KeyboardInterrupt: 

In [None]:
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet part-of-speech constant.

    The decision is made on the tag's leading letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_constant in prefix_table:
        if tag.startswith(prefix):
            return wordnet_constant
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
def _to_wordnet_pairs(tagged_tokens):
    """Replace the tag in each (word, tag) pair with its WordNet POS constant."""
    return [(token, get_wordnet_pos(tag_name)) for token, tag_name in tagged_tokens]


rws['wordnet_pos'] = rws['pos_tags'].apply(_to_wordnet_pairs)
rws.head()

Unnamed: 0,0,no_contract,fix_str,langs,tokenized,lower,no_punc,stopwords_removed,pos_tags,wordnet_pos
0,mix,[mix],mix,pl,[mix],[mix],[mix],[mix],"[(mix, NN)]","[(mix, n)]"
1,dulatep,[dulatep],dulatep,ru,[dulatep],[dulatep],[dulatep],[dulatep],"[(dulatep, NN)]","[(dulatep, n)]"
2,rosidi,[rosidi],rosidi,it,[rosidi],[rosidi],[rosidi],[rosidi],"[(rosidi, NN)]","[(rosidi, n)]"
3,syafri,[syafri],syafri,ms,[syafri],[syafri],[syafri],[syafri],"[(syafri, NN)]","[(syafri, n)]"
4,hiro,[hiro],hiro,nl,[hiro],[hiro],[hiro],[hiro],"[(hiro, NN)]","[(hiro, n)]"


In [None]:
wnl = WordNetLemmatizer()


def _lemmatize_pairs(pairs):
    """Lemmatize each (word, wordnet_pos) pair and return the lemmas in order."""
    lemmas = []
    for token, wordnet_tag in pairs:
        lemmas.append(wnl.lemmatize(token, wordnet_tag))
    return lemmas


rws['lemmatized'] = rws['wordnet_pos'].apply(_lemmatize_pairs)
rws.head()

Unnamed: 0,0,no_contract,fix_str,langs,tokenized,lower,no_punc,stopwords_removed,pos_tags,wordnet_pos,lemmatized
0,mix,[mix],mix,pl,[mix],[mix],[mix],[mix],"[(mix, NN)]","[(mix, n)]",[mix]
1,dulatep,[dulatep],dulatep,ru,[dulatep],[dulatep],[dulatep],[dulatep],"[(dulatep, NN)]","[(dulatep, n)]",[dulatep]
2,rosidi,[rosidi],rosidi,it,[rosidi],[rosidi],[rosidi],[rosidi],"[(rosidi, NN)]","[(rosidi, n)]",[rosidi]
3,syafri,[syafri],syafri,ms,[syafri],[syafri],[syafri],[syafri],"[(syafri, NN)]","[(syafri, n)]",[syafri]
4,hiro,[hiro],hiro,nl,[hiro],[hiro],[hiro],[hiro],"[(hiro, NN)]","[(hiro, n)]",[hiro]


In [None]:
# Persist the frame with all intermediate columns. The row index is written too (pandas
# default), so it shows up as an extra leading column when this file is read back.
rws.to_csv('kamus_useless_transalation.csv')

In [None]:
# Reload the annotated table. DataFrame.to_csv wrote it with its default UTF-8 encoding, so
# it must be read back as UTF-8; 'unicode_escape' would garble non-ASCII words and treat
# backslashes in the data as escape sequences.
df = pd.read_csv("kamus_useless_transalation.csv", encoding='utf-8')

# Keep the rows whose detected language is anything other than English. The column is
# selected by name rather than by position so the filter survives column reordering.
new_df = df[df['langs'] != "en"]

new_df.to_csv('kamus_useless_transalation_without_english.csv')


In [None]:
# Export only the detected-language column of the current `new_df` selection.
detected_langs = new_df['langs']
detected_langs.to_csv('lang.csv')

In [None]:
# Reload the annotated table. DataFrame.to_csv wrote it with its default UTF-8 encoding, so
# it must be read back as UTF-8; 'unicode_escape' would garble non-ASCII words and treat
# backslashes in the data as escape sequences.
df = pd.read_csv("kamus_useless_transalation.csv", encoding='utf-8')

# Keep only the rows detected as English. The column is selected by name rather than by
# position so the filter survives column reordering.
new_df = df[df['langs'] == "en"]

new_df.to_csv('kamus_useless_transalation_with_english.csv')

In [None]:
# Reload the annotated table. DataFrame.to_csv wrote it with its default UTF-8 encoding, so
# it must be read back as UTF-8; 'unicode_escape' would garble non-ASCII words and treat
# backslashes in the data as escape sequences.
df = pd.read_csv("kamus_useless_transalation.csv", encoding='utf-8')

# Keep only the rows detected as Indonesian ('id'). The column is selected by name rather
# than by position so the filter survives column reordering.
new_df = df[df['langs'] == "id"]

new_df.to_csv('kamus_useless_transalation_with_indonesia.csv')