In [None]:
!pip install pycaret

In [None]:
pip install --upgrade scikit-learn==0.23.2

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import nltk

# Download the stop-word corpus that stopwords.words(...) reads from.
nltk.download(info_or_id='stopwords')

In [None]:
#import data and preprocess
import pandas as pd
import re
import string
from string import punctuation
from nltk.corpus import stopwords
stop_words = stopwords.words('english') # or 'indonesian' for Indonesian-language text

#feature extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#modelling
from sklearn.model_selection import train_test_split
from sklearn import svm, naive_bayes
from sklearn.linear_model import LogisticRegression

#evaluation metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Input workbooks: the files read as depressive tweets, and the non-depressive one.
dep = [
    'cuttingquote.xlsx',
    'depressed1278.xlsx',
    'depressingmsgs.xlsx',
    'suicidalconcept.xlsx',
]
nondep = 'nondepressiontweet.xlsx'

def read_dep_files(files, full_file_index=1, skipfooter=200):
  """Load the depressive-tweet workbooks into one labelled DataFrame.

  Only the third column (``usecols=[2]``) of each workbook is read. The
  workbook at position ``full_file_index`` is read completely; every other
  workbook has its last ``skipfooter`` rows dropped. Trimmed workbooks come
  first in the result (in input order), followed by the untrimmed one.

  Args:
    files: Sequence of Excel file paths.
    full_file_index: Position in ``files`` of the workbook read in full.
    skipfooter: Number of trailing rows dropped from the other workbooks.

  Returns:
    DataFrame whose ``text`` column is renamed to ``tweet`` and which has a
    constant ``label`` column equal to 1.

  Raises:
    ValueError: If ``files`` is empty.
  """
  files = list(files)
  if not files:
    raise ValueError("read_dep_files needs at least one Excel file")

  trimmed, untrimmed = [], []
  # Read every workbook exactly once; the previous loop rebuilt the trimmed
  # list on each iteration and re-read the same files several times.
  for index, item in enumerate(files):
    if index == full_file_index:
      untrimmed.append(pd.read_excel(item, usecols=[2]))
    else:
      trimmed.append(pd.read_excel(item, usecols=[2], skipfooter=skipfooter))

  df = pd.concat(trimmed + untrimmed, ignore_index=True)
  df = df.rename(columns={'text': 'tweet'})
  df['label'] = 1
  return df

def read_nondep_files(files):
    """Read the non-depressive workbook and return it as a DataFrame.

    ``files`` is handed unchanged to ``pandas.read_excel``.
    """
    return pd.read_excel(files)

In [None]:
# Load all depressive-tweet workbooks into one frame (label = 1) and display it.
df_depressed = read_dep_files(dep)
df_depressed

In [None]:
# Load the non-depressive workbook and display it.
# NOTE(review): later cells assume this file already has `tweet` and `label` columns - confirm.
df_nondepressed = read_nondep_files(nondep)
df_nondepressed

In [None]:
# Stack both classes into one frame and record each tweet's character count.
combined_parts = [df_depressed, df_nondepressed]
df = pd.concat(combined_parts, ignore_index=True)
df['length'] = df['tweet'].map(len)
df.describe()

In [None]:
# (rows, columns) of the combined dataset.
df.shape

In [None]:
# Display the combined dataset.
df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the theme before drawing: seaborn styles only affect figures created
# afterwards, so setting it after the plot left this chart unstyled.
sns.set(style='darkgrid')

plt.figure(figsize=(8,6))
# Pass the column by keyword: newer seaborn releases make `x` keyword-only, so a
# positional Series would be interpreted as `data` instead.
sns.countplot(x=df.label, palette='ch:7', edgecolor= 'white', linewidth=5)
plt.title('Number of tweets per label')
plt.show()

### Data Cleansing

In [None]:
! pip install tweet-preprocessor 

import preprocessor as p

In [None]:
# Raise pandas' display limits so long tweets and large previews are shown in full.
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 1000)

In [None]:
# Create a new column for the cleaned text; it starts out as the raw tweet.

df['tweet_clean'] = df['tweet']
df.head(50)

In [None]:
# Create a length column for tweet_clean.
# NOTE(review): no cleaning step has run yet, so these are still the raw tweet lengths.

df['tweet_clean_length'] = df['tweet_clean'].apply(len)
df.head(5)

#### Removing Link

In [None]:
# Remove links and @mentions
import re

# Matches @mentions, http(s):// URLs and bare "www." hosts. The word boundary
# and the literal dot stop elongated words such as "awwww" from being treated
# as links (the old unanchored `www\S+` cut them down to "a").
_LINK_PATTERN = re.compile(r"(?:@|https?://|\bwww\.)\S+")

def deLink(text):
    """Replace every @mention and URL in ``text`` with a single space.

    Args:
        text: The tweet text (``str``).

    Returns:
        ``text`` with each run of non-whitespace characters that starts with
        ``@``, ``http://``, ``https://`` or ``www.`` replaced by one space.
        Surrounding whitespace is left untouched.
    """
    return _LINK_PATTERN.sub(" ", text)

# Strip links and mentions from every tweet, then preview the first 50 rows.
df['tweet_clean'] = df['tweet_clean'].map(deLink)
df.head(50)

#### Removing Line Breaks

In [None]:
# Replace '\n' with a space

df['tweet_clean'] = df['tweet_clean'].map(lambda tweet: tweet.replace('\n', ' '))
df.head(5)

#### Removing Punctuations

In [None]:
# Remove punctuation

import string

def message_cleaning(message):
    """Replace ASCII punctuation with spaces and collapse whitespace.

    Every character listed in ``string.punctuation`` becomes a space; the
    remaining whitespace-separated tokens are then joined with single spaces,
    which also drops leading and trailing whitespace.
    """
    punct_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    without_punct = message.translate(punct_to_space)
    return " ".join(without_punct.split())

# Strip punctuation from every tweet, then preview the last 50 rows.
df['tweet_clean'] = df['tweet_clean'].map(message_cleaning)
df.tail(50)

#### Removing Emoji

In [None]:
# Remove emoji
import re

# Character class of emoji and emoji-sequence helpers. The ranges are limited to
# symbol blocks: the previous single range U+24C2..U+1F251 also covered every
# CJK/Hangul/kana letter (deleting such text outright), while most of
# U+1F900..U+1F9FF (e.g. the thinking face) was not covered at all.
# Not an exhaustive emoji list.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F000-\U0001FAFF"      # pictographic blocks: tiles/cards, enclosed symbols, flags,
                                 # pictographs, emoticons, transport, supplemental pictographs
    "\u2600-\u27BF"              # miscellaneous symbols and dingbats
    "\u25A0-\u25FF"              # geometric shapes
    "\u2B00-\u2BFF"              # miscellaneous symbols and arrows
    "\u24C2"                     # circled M
    "\u3030\u303D\u3297\u3299"   # the few emoji located in CJK symbol blocks
    "\u200d"                     # zero-width joiner used inside emoji sequences
    "\uFE0F"                     # emoji variation selector
    "]+",
    flags=re.UNICODE,
)

def deEmojify(text):
    """Return ``text`` with emoji characters removed.

    Args:
        text: The tweet text (``str``).

    Returns:
        ``text`` without any character matched by ``_EMOJI_PATTERN``. Nothing
        is inserted in their place, so surrounding whitespace is preserved.
        Letters of any script are left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)

# Strip emoji from every tweet, then preview the last 50 rows.
df['tweet_clean'] = df['tweet_clean'].map(deEmojify)
df.tail(50)

In [None]:
# Refresh the length column first: it was filled in before any cleaning step ran,
# so without this the histogram (and the describe() summary) would show raw lengths.
df['tweet_clean_length'] = df['tweet_clean'].apply(len)
df['tweet_clean_length'].plot(bins=100, kind='hist')

In [None]:
# Summarize after cleansing
df.describe()

### Word Cloud

In [None]:
#Depressive Words

# WordCloud has to be imported explicitly; no earlier cell brings it into scope.
from wordcloud import WordCloud

# Join every cleaned tweet labelled 1 into one string for the word cloud.
depressive_words = ' '.join(list(df[df['label'] == 1]['tweet_clean']))
depressive_wc = WordCloud(width = 612,height = 612, collocations=False, colormap="Blues").generate(depressive_words)
plt.figure(figsize = (10, 8), facecolor = 'k')
plt.imshow(depressive_wc)
plt.axis('off')
plt.tight_layout(pad = 0)
plt.show()

In [None]:
#Non-Depressive Words

# WordCloud has to be imported explicitly; no earlier cell brings it into scope.
from wordcloud import WordCloud

# Join every cleaned tweet labelled 0 into one string for the word cloud.
# Dedicated names are used so this cell no longer overwrites variables that are
# named after the depressive class.
nondepressive_words = ' '.join(list(df[df['label'] == 0]['tweet_clean']))
nondepressive_wc = WordCloud(width = 612,height = 612, collocations=False, colormap="Blues").generate(nondepressive_words)
plt.figure(figsize = (10, 8), facecolor = 'k')
plt.imshow(nondepressive_wc)
plt.axis('off')
plt.tight_layout(pad = 0)
plt.show()

# Bag of Words

In [None]:
# Count vectorizer: lowercases, drops the NLTK stop words and keeps only runs of ASCII letters as tokens.
cv = CountVectorizer(lowercase = True, stop_words = stop_words, token_pattern="[A-Za-z]+")
# Convert the cleaned tweets into bag-of-words form
BoW = cv.fit_transform(df['tweet_clean'])
# Densify into a DataFrame with one column per vocabulary term.
BoW_df = pd.DataFrame(BoW.toarray(), columns=cv.get_feature_names())
# Attach the target; the mapping is the identity for 0/1 and yields NaN for any other label.
BoW_df['target_cat'] = df.reset_index().label.map({0:0, 1:1})
BoW_df

In [None]:
from pycaret.classification import *
# Store the result under its own name: `setup = setup(...)` replaced pycaret's
# setup() function with its return value, so it could not be called again
# without re-importing.
bow_experiment = setup(data=BoW_df, target='target_cat', session_id=123, train_size = 0.6, fold=5)

In [None]:
# Compare pycaret's candidate classifiers on the BoW features.
# NOTE(review): the name `models` appears to shadow pycaret.classification.models from the star import - confirm.
models = compare_models()

### Confusion Matrix (BoW)

In [None]:
# Train the model with pycaret ID 'lr' on the current (BoW) setup and plot its confusion matrix.
lr = create_model('lr')
plot_model(lr, "confusion_matrix")

# TF - IDF

In [None]:
# TF-IDF vectorizer configured like the count vectorizer: lowercase, NLTK stop words, ASCII-letter tokens only.
tv = TfidfVectorizer(lowercase = True, stop_words = stop_words, token_pattern="[A-Za-z]+")
# Convert the cleaned tweets into TF-IDF form
tf_idf = tv.fit_transform(df['tweet_clean'])
# Densify into a DataFrame with one column per vocabulary term.
tf_idf_df = pd.DataFrame(tf_idf.toarray(), columns=tv.get_feature_names())
# Attach the target as readable class names; labels other than 0/1 become NaN.
tf_idf_df['target_cat'] = df.reset_index().label.map({0:'nondepressed', 1:'depressed'})
tf_idf_df

In [None]:
from pycaret.classification import *
# Store the result under its own name: `setup = setup(...)` replaced pycaret's
# setup() function with its return value, so it could not be called again
# without re-importing.
tfidf_experiment = setup(data=tf_idf_df, target='target_cat', session_id=123, train_size = 0.7, fold=10)

In [None]:
# Compare pycaret's candidate classifiers on the TF-IDF features.
# NOTE(review): the name `models` appears to shadow pycaret.classification.models from the star import - confirm.
models = compare_models()

### Confusion Matrix (TF-IDF)

In [None]:
# Train the model with pycaret ID 'ridge' on the current (TF-IDF) setup and plot its confusion matrix.
ridge_model = create_model('ridge')
plot_model(ridge_model, "confusion_matrix")

### Prediction

In [None]:
# Finalize the ridge model with pycaret's finalize_model and display the result.
ridge_final = finalize_model(ridge_model)
ridge_final

In [None]:
# Example sentence to classify (a one-element list, as the vectorizer expects an iterable of documents).
text = ['There are no words for the pain of feeling so unwanted.']

In [None]:
# Convert the text into a DataFrame.
# Clean the input the same way as the training tweets, then vectorize it with the
# TF-IDF vectorizer (`tv`): ridge_final was trained on tf_idf_df, so feeding it
# raw counts from the BoW vectorizer (`cv`) gave it features on the wrong scale.
text_clean = [deEmojify(message_cleaning(deLink(t).replace('\n', ' '))) for t in text]
text_transformed = tv.transform(text_clean)
text_transformed_df = pd.DataFrame(text_transformed.toarray(), columns=tv.get_feature_names())

#Predict the text
prediction = predict_model(ridge_final, text_transformed_df)
prediction[['Label']]