In [None]:
!pip install google-play-scraper

In [None]:
from google_play_scraper import app
import pandas as pd
import numpy as np

In [None]:
from google_play_scraper import Sort, reviews

# Fetch one batch of reviews for the target app. The call returns the review
# records together with a continuation token.
result, continuation_token = reviews(
    'com.',                   # app id of the app to scrape (fill in the target app's id)
    country='id',             # country code
    lang='id',                # language code
    count=1000,               # how many reviews to request
    filter_score_with=None,   # a rating 1-5 keeps only that score; None keeps all
    sort=Sort.MOST_RELEVANT,
)

In [None]:
# Build one row per review directly from the scraped records; the record keys
# become the columns. This replaces the former detour through a one-column
# object array that was popped and expanded again, which gave the same frame.
df_busu = pd.DataFrame(result)
df_busu.head()

In [None]:
# Row count of the scraped frame (one row per retrieved review).
len(df_busu.index)

In [None]:
# Preview the four columns that the following cells work with.
df_busu[['userName', 'score', 'at', 'content']].head()

In [None]:
# Keep only the user name, score, 'at' and review text columns.
new_df = df_busu[['userName', 'score', 'at', 'content']]
# Order by 'at', largest value first (presumably the review timestamp, i.e.
# newest first — confirm against the scraper's output).
sorted_df = new_df.sort_values(by='at', ascending=False)
sorted_df.head()

In [None]:
# Select the same four columns from the sorted frame (nothing is dropped here).
my_df = sorted_df[['userName', 'score', 'at', 'content']]

In [None]:
# Keep only the review text and its score. The explicit .copy() makes my_df an
# independent frame, so adding the 'Label' column to it later does not trigger
# pandas' SettingWithCopyWarning.
my_df = my_df[['content', 'score']].copy()

In [None]:
# First rows of the text/score frame.
my_df.head()

In [None]:
def labelling(score):
  """Map a numeric rating to a sentiment label.

  Ratings below 3 give 'Negative'; ratings equal to 4 or 5 give 'Positive'.
  Any other value (3 included) gives None.
  """
  if score < 3:
    return 'Negative'
  if score in (4, 5):
    return 'Positive'
  return None

# Derive the sentiment label of every review from its score.
my_df['Label'] = my_df['score'].map(labelling)
my_df.head(50)

In [None]:
# Write the labelled reviews to CSV in the working directory, without the index column.
my_df.to_csv('scrapped_data.csv', index=False)

In [None]:
# Show every column when a frame is displayed (option name spelled out in full).
pd.set_option('display.max_columns', None)
# Read the file back from the same relative path it was written to; the former
# hard-coded '/content/...' path only matches when the working directory is /content.
my_df = pd.read_csv('scrapped_data.csv')
my_df.head(50)

In [None]:
# Column dtypes and non-null counts.
my_df.info()

In [None]:
# Boolean frame marking which cells are missing.
my_df.isna()

In [None]:
# Per column: does it contain at least one missing value?
my_df.isna().any()

In [None]:
# Summary statistics of the frame.
my_df.describe()

In [None]:
# Number of missing values per column.
my_df.isnull().sum()

In [None]:
# Drop the rows without a label (labelling returns None for scores that are
# neither below 3 nor equal to 4 or 5). Done in place: my_df itself shrinks.
my_df.dropna(subset=['Label'], inplace = True)

In [None]:
# Missing-value counts again, after the unlabelled rows were dropped.
my_df.isnull().sum()

In [None]:
# First 50 rows of the labelled frame.
my_df.head(50)

In [None]:
# Write the labelled frame (unlabelled rows removed) to CSV, without the index column.
my_df.to_csv('name-apps.csv', index=False)

# Text Pre-Processing

In [None]:
# Load the labelled reviews from the same relative path they were saved to; the
# former hard-coded '/content/...' path only matches when the working directory is /content.
df = pd.read_csv('name-apps.csv')
df.head(50)

# Case Folding

In [None]:
import re
def clean_text(df, text_field, new_text_field_name):
  """Lower-case a text column and strip noise from it, storing the result in a new column.

  Args:
    df: DataFrame holding the text; it is modified in place.
    text_field: name of the column with the raw text.
    new_text_field_name: name of the column that receives the cleaned text.

  Returns:
    The same ``df`` object, with ``new_text_field_name`` added or overwritten.
  """
  # Removes @mentions, every character that is not a letter, digit, space or
  # tab, scheme://... links, a leading "rt", and "http" plus one character.
  noise = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")
  digits = re.compile(r"\d+")

  # Operate on the frame that was passed in. The previous body ignored ``df``
  # and always read and wrote the global ``my_df``.
  lowered = df[text_field].str.lower()
  df[new_text_field_name] = lowered.apply(lambda elem: digits.sub("", noise.sub("", elem)))
  return df

In [None]:
# clean_text lower-cases 'content' itself and (over)writes 'text_clean', so the
# former separate lower-casing step and the bare `my_df['text_clean']`
# expression (never displayed, since it was not the cell's last line) are dropped.
data_clean = clean_text(my_df, 'content', 'text_clean')
data_clean.head(10)

# Stopword Removal

In [None]:
import nltk.corpus
nltk.download('stopwords')

from nltk.corpus import stopwords
stop = stopwords.words('indonesian') # you can modify this to your language
# Set copy for constant-time membership tests; `stop` itself is left unchanged.
stop_lookup = set(stop)

# Rebuild each cleaned review from the words that are not stop words.
data_clean['text_Stopword'] = data_clean['text_clean'].apply(
  lambda x: ' '.join(word for word in x.split() if word not in stop_lookup))
data_clean.head(50)

# Tokenizing

In [None]:
import nltk
nltk.download('punkt')
# NLTK 3.9+ loads the tokenizer data from the 'punkt_tab' resource; fetch it as
# well so word_tokenize also works on current releases.
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize, word_tokenize
# Split every stop-word-filtered review into a list of word tokens.
data_clean['text_tokens'] = data_clean['text_Stopword'].apply(word_tokenize)
data_clean.head()

# Stemming (for Indonesian language)

In [None]:
!pip install Sastrawi

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Create the Sastrawi stemmer once; stemmed_wrapper calls it for every term.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
def stemmed_wrapper(term):
  """Return `term` stemmed by the notebook-level `stemmer`."""
  stemmed = stemmer.stem(term)
  return stemmed

term_dict = {}
hitung = 0

# Collect every distinct token; ' ' is a placeholder until its stem is computed.
for document in data_clean['text_tokens']:
  term_dict.update(dict.fromkeys(document, ' '))

print(len(term_dict))
print("------------------------")

# Stem each distinct token exactly once, logging the running count.
for hitung, term in enumerate(term_dict, start=1):
  term_dict[term] = stemmed_wrapper(term)
  print(hitung, ":", term, ":", term_dict[term])

print(term_dict)
print("------------------------")

def get_stemmed_term(document):
  """Return the stems of the terms in `document`, in order, looked up in `term_dict`."""
  stems = []
  for token in document:
    stems.append(term_dict[token])
  return stems

# Replace every token by its stem and join the stems into one string per review.
data_clean['text_steamindo'] = data_clean['text_tokens'].map(
  lambda tokens: ' '.join(get_stemmed_term(tokens)))
data_clean.head(20)

In [None]:
# Write the frame with all preprocessing columns to CSV, without the index column.
data_clean.to_csv('result_TextPreProcessing_nameapp.csv', index=False)

# TF-IDF

In [None]:
!pip install sklearn

In [None]:
def praproses(text):
  """Normalise a raw review string.

  Strips HTML-like tags, lower-cases the text, collapses every run of
  non-word characters into a single space, and appends the emoticons found in
  the tag-free text (with any '-' removed) at the end.

  Args:
    text: the raw string.

  Returns:
    The cleaned string followed by the space-separated emoticons.
  """
  # Fixed: this used to read `text.re.sub(...)`, which raises AttributeError
  # and never stored the tag-free text.
  text = re.sub(r'<[^>]*>', '', text)
  # Eyes (: ; =), optional nose (-), mouth ( ")" "(" "D" "P" ). The former
  # pattern had a stray empty capture group, so findall returned empty
  # strings, and it only accepted the two-character mouth ")(".
  emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
  text = (re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', ''))

  return text

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
# NOTE(review): the features are the raw 'content' column, so none of the
# preprocessed columns built earlier ('text_clean', 'text_Stopword',
# 'text_steamindo') reach the model — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(data_clean['content'], data_clean['Label'], test_size=0.20, random_state=0)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training text only, then apply the same
# fitted vectorizer to the test text.
tfidf_vectorizer = TfidfVectorizer()
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

In [None]:
# Shapes of the four parts of the split.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a term-count vectorizer on the training text.
vectorizer = CountVectorizer()
vectorizer.fit(X_train)

In [None]:
# NOTE: this rebinds X_train / X_test from the raw text to the vectorizer's
# output. Re-running this cell, or any earlier cell that expects text in
# X_train / X_test, would then receive the transformed data instead.
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes classifier on the TF-IDF training features.
nb = MultinomialNB()
nb.fit(tfidf_train, y_train)

In [None]:
# Dense array form of X_train as currently bound (materialises the whole matrix).
X_train.toarray()

In [None]:
# Predict the labels of the TF-IDF test features with the fitted `nb` model.
y_pred = nb.predict(tfidf_test)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the TF-IDF model's predictions, shown as the cell output
# (it was previously computed but never displayed).
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Second model: multinomial naive Bayes on X_train / X_test as currently bound.
clf = MultinomialNB()
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

# pos_label must be one of the labels present in y ('Negative' / 'Positive', as
# produced by labelling). The former "Negatif" is not among them, which makes
# the binary-averaged metrics raise ValueError.
print("MultinomialNB Accuracy:", accuracy_score(y_test, predicted))
print("MultinomialNB Precission:", precision_score(y_test, predicted, average="binary", pos_label="Negative"))
print("MultinomialNB Recall:", recall_score(y_test, predicted, average="binary", pos_label="Negative"))
print("MultinomialNB f1_score:", f1_score(y_test, predicted, average="binary", pos_label="Negative"))

print(f'confussion_matrix:\n {confusion_matrix(y_test, predicted)}')
print('=====================================================\n')
print(classification_report(y_test, predicted, zero_division=0))

# Reload the preprocessed frame from the CSV written earlier.
data_clean = pd.read_csv('result_TextPreProcessing_nameapp.csv')
