<a href="https://colab.research.google.com/github/donyeun/AMLSII_19-20_SN18154195/blob/master/AMLS_II.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Mount Google Drive

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; every dataset path in `cfg` lives below this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Libraries and Variables

In [0]:
import pandas as pd
import os
import torch
from tqdm import tqdm
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import nltk
import csv

In [0]:
# Notebook-wide configuration: dataset locations on the mounted Drive plus the
# switches read by the loading and tokenisation cells further down.
cfg = {
    'paths': {
        # Task A: training folder, extra (non-twitter) training files, test file
        'train_folder_task_a': '/content/drive/My Drive/public/AMLSII_19-20_SN18154195/dataset/training/2017_English_final/GOLD/Subtask_A',
        'train_additional_dataset_filenames': [
            'livejournal-2014test-A.tsv',
            'sms-2013test-A.tsv',
        ],
        'test_file_task_a': '/content/drive/My Drive/public/AMLSII_19-20_SN18154195/dataset/testing/SemEval2017-task4-test/SemEval2017-task4-test.subtask-A.english.txt',

        # Task B: training folder and test file
        'train_folder_task_b': '/content/drive/My Drive/public/AMLSII_19-20_SN18154195/dataset/training/2017_English_final/GOLD/Subtasks_BD',
        'test_file_task_b': '/content/drive/My Drive/public/AMLSII_19-20_SN18154195/dataset/testing/SemEval2017-task4-test/SemEval2017-task4-test.subtask-BD.english.txt',
    },
    'task_a': {
        # when True, the additional corpora listed above are merged into the Task A training set
        'use_additional_dataset': True,
    },
    'preprocessing': {
        # forwarded to nltk's TweetTokenizer (strip_handles / preserve_case)
        'tokenization': {
            'remove_twitter_handle': True,
            'preserve_case': False,
        },
    },
}

# Load Dataset


## Preprocess Additional Dataset
In Task A, the competition provides additional datasets alongside the standard Twitter corpus: the SMS and LiveJournal corpora. We can use both of them once their formatting has been preprocessed to match the Twitter datasets.

In [4]:
# One DataFrame per additional Task A corpus, keyed by file name.
task_a_train_additional_dfs = {}

for filename in cfg['paths']['train_additional_dataset_filenames']:
  # Read the corpus and discard its one unneeded column in a single step:
  # column 0 for livejournal, column 1 for sms, nothing for any other file.
  task_a_train_additional_dfs[filename] = pd.read_csv(
      os.path.join(cfg['paths']['train_folder_task_a'], filename),
      sep='\t',
      header=None,
  ).drop(
      columns={
          'livejournal-2014test-A.tsv': [0],
          'sms-2013test-A.tsv': [1],
      }.get(filename, [])
  )

  # Relabel the remaining columns 0..n-1 so they line up with the twitter files.
  task_a_train_additional_dfs[filename].columns = range(task_a_train_additional_dfs[filename].shape[1])

  print(task_a_train_additional_dfs[filename])

             0         1                                                  2
0     LJ111111  negative  I know I missed something here , but what does...
1     LJ111113   neutral  What do you think of Beside Ourselves as a tit...
2     LJ111114  positive                    :D I intend to be one someday .
3     LJ111117  negative  LLLINKKK LLLINKKK IIIMAGEEELLLINKKK The choice...
4     LJ111119   neutral                     LLLINKKK Some more mountains .
...        ...       ...                                                ...
1137  LJ113616  positive                     Maybe it was - his - fantasy ?
1138  LJ113618  negative  It was ok , but they always just seem so nervo...
1139  LJ113621  positive  It is streamable from YepRoc -- matter of fact...
1140  LJ113623  positive  comment telling me who you are , or how you fo...
1141  LJ113625   neutral  im on myspace ... ill try and find you and add...

[1142 rows x 3 columns]
          0         1                                          

## Make Datasets As Pandas' DataFrames



In [0]:
def append_txt_files_as_one_dataframe(folderpath, filename_keywords_list, additional_dataset_dfs=None):
  """Read the matching tab-separated files in a folder and stack them into one DataFrame.

  Args:
    folderpath: directory whose entries are scanned with ``os.listdir``.
    filename_keywords_list: a file is read only if its name contains at least
      one of these substrings (this skips unrelated files such as a readme).
    additional_dataset_dfs: optional ``{name: DataFrame}`` mapping of already
      loaded frames that are stacked after the files read from disk.

  Returns:
    A single DataFrame with a fresh 0..n-1 index. The frames appear in
    ``os.listdir`` order followed by the additional frames. An empty
    DataFrame is returned when nothing matched and nothing extra was given.

  Side effects:
    Prints ``(shape, filename)`` for every file that is read.
  """
  dataset_per_file_dfs = {}

  # open txt files (in tsv formatting); the listing order is deliberately left
  # as returned by os.listdir so the row order of the result does not change
  for filename in os.listdir(folderpath):
    if any(keyword in filename for keyword in filename_keywords_list):
      # QUOTE_NONE: quote characters inside a tweet are plain text, not field delimiters
      dataset_per_file_dfs[filename] = pd.read_csv(os.path.join(folderpath, filename), sep="\t", header=None, quoting=csv.QUOTE_NONE)
      print(dataset_per_file_dfs[filename].shape, filename)

  # combine the original dataset with additional dataset (if any)
  if additional_dataset_dfs is not None:
    dataset_per_file_dfs.update(additional_dataset_dfs)

  # pd.concat raises on an empty list, so keep the "nothing to load" result explicit
  if not dataset_per_file_dfs:
    return pd.DataFrame()

  # a single concat replaces the repeated DataFrame.append calls, which were
  # quadratic and no longer exist in pandas >= 2.0
  return pd.concat(list(dataset_per_file_dfs.values()), ignore_index=True)

In [0]:
def clean_dataframe_format(df, new_column_name_list, drop_column_list=None):
  """Return a copy of ``df`` with some columns dropped and the rest renamed.

  Args:
    df: input DataFrame; it is not modified.
    new_column_name_list: names assigned, in order, to the columns that remain
      after dropping. Its length must equal the number of remaining columns.
    drop_column_list: column labels to remove before renaming. ``None`` (the
      default) or an empty list drops nothing.

  Returns:
    A new DataFrame with the requested columns removed and the new names set.

  Raises:
    KeyError: if a label in ``drop_column_list`` is not a column of ``df``.
    ValueError: if ``new_column_name_list`` has the wrong length.
  """
  # None replaces the former mutable ``[]`` default; both mean "drop nothing"
  if drop_column_list:
    df = df.drop(columns=drop_column_list)
  else:
    # still work on a copy so the caller's frame keeps its original column labels
    df = df.copy()
  df.columns = new_column_name_list
  return df

In [9]:
# Build the Task A training frame from every "twitter" file in the folder.
# The additional corpora are handed over only when the config enables them;
# otherwise None is passed, which is the loader's own default.
task_a_train_df = clean_dataframe_format(
    append_txt_files_as_one_dataframe(
        cfg['paths']['train_folder_task_a'],
        ['twitter'],
        task_a_train_additional_dfs if cfg['task_a']['use_additional_dataset'] else None,
    ),
    ['id', 'sentiment', 'text'],
    drop_column_list=[3],  # only some files carry a 4th column
)
task_a_train_df

(2000, 3) twitter-2016devtest-A.txt
(1999, 3) twitter-2016dev-A.txt
(6000, 3) twitter-2016train-A.txt
(1654, 3) twitter-2013dev-A.txt
(3547, 3) twitter-2013test-A.txt
(9684, 3) twitter-2013train-A.txt
(1853, 3) twitter-2014test-A.txt
(2390, 3) twitter-2015test-A.txt
(489, 3) twitter-2015train-A.txt
(86, 3) twitter-2014sarcasm-A.txt
(20633, 4) twitter-2016test-A.txt


Unnamed: 0,id,sentiment,text
0,637641175948763136,neutral,@SeeMonterey LOST - Sony cell phone with holid...
1,637651487762554881,neutral,"@PersonaSoda well yeah, that's third parties. ..."
2,637666734300905472,negative,Sony rewards app is like a lot of 19 y.o femal...
3,637668142110654468,neutral,@fakethom Have android tab and don't use phone...
4,637708370129125377,positive,Finally I get my ps4 back I sent it to Sony ca...
...,...,...,...
53566,10038,negative,Oki... Think i\u2019m confused... I only know ...
53567,11799,neutral,Yup... Ok i go home look at the timings then i...
53568,11945,neutral,Here got lots of hair dresser fr china.
53569,10154,neutral,no alh we are not discussing fromt he viewpoi...


In [12]:
# Build the Task B training frame from every "twitter" file in the folder,
# drop the surplus 5th column and name the remaining four.
task_b_train_df = clean_dataframe_format(
    append_txt_files_as_one_dataframe(cfg['paths']['train_folder_task_b'], ['twitter']),
    ['id', 'topic', 'sentiment', 'text'],
    drop_column_list=[4],
)
task_b_train_df

(4346, 4) twitter-2016train-BD.txt
(1325, 4) twitter-2016dev-BD.txt
(1417, 4) twitter-2016devtest-BD.txt
(489, 4) twitter-2015train-BD.txt
(10552, 5) twitter-2016test-BD.txt
(2383, 5) twitter-2015testBD.txt


Unnamed: 0,id,topic,sentiment,text
0,628949369883000832,@microsoft,negative,dear @Microsoft the newOoffice for Mac is grea...
1,628976607420645377,@microsoft,negative,@Microsoft how about you make a system that do...
2,629023169169518592,@microsoft,negative,I may be ignorant on this issue but... should ...
3,629179223232479232,@microsoft,negative,"Thanks to @microsoft, I just may be switching ..."
4,629226490152914944,@microsoft,positive,"Microsoft, I may not prefer your gaming branch..."
...,...,...,...,...
20507,520957142816530432,younique,positive,Check out the gorgeous products from Younique!...
20508,522889214036156417,younique,neutral,"""LADIES! BULK ORDER GOES IN TOMORROW AT 10PM ..."
20509,521416045865992192,younique,positive,"""Here is something fun for you: On the 14th, ..."
20510,522912399322394624,younique,positive,Get the best mascara in the industry here! Im ...


In [30]:
# Inspect the Task B training tweets whose text is shorter than 40 characters.
# (An earlier check for sentiment labels other than negative / positive /
# neutral used to live in this cell and was left disabled.)
task_b_train_df.loc[task_b_train_df['text'].str.len() < 40]

Unnamed: 0,id,topic,sentiment,text
162,641097610040877056,ac/dc,positive,Someone come see AC/DC with me Thursday
882,640504136769732608,bentley,positive,Bringing the Bentley out tomorrow.
887,640776304170782720,bentley,positive,Ready to be with Bentley tomorrow.
1073,630580275106480129,bob marley,positive,#np Bob Marley - the sun is shining
1594,622258302664228864,digi,positive,digi is tomorrow and im not going ):::
...,...,...,...,...
20014,523081396852903936,school,negative,School tomorrow more like fuck you.
20015,523042458091790336,school,negative,School on Monday! I don't want!!!! :(
20016,522976475633700864,school,negative,School just isn't happening tomorrow
20400,522987022265643008,white house,neutral,Going by the White house tomorrow


In [0]:
# Read the Task A test file with quoting disabled, the same way the training
# files are read, so an unbalanced double quote inside a tweet cannot swallow
# the following lines into a single field.
task_a_test_df = pd.read_csv(
    cfg['paths']['test_file_task_a'],
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
)
task_a_test_df = clean_dataframe_format(task_a_test_df, ['id', 'sentiment', 'text'])
task_a_test_df

In [0]:
# Read the Task B test file with quoting disabled, the same way the training
# files are read, so an unbalanced double quote inside a tweet cannot swallow
# the following lines into a single field.
task_b_test_df = pd.read_csv(
    cfg['paths']['test_file_task_b'],
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
)
task_b_test_df = clean_dataframe_format(task_b_test_df, ['id', 'topic', 'sentiment', 'text'])
task_b_test_df

# Exploratory Data Analysis (EDA)

# Tokenization

In [0]:
# Tweet-aware tokenizer, configured from cfg['preprocessing']['tokenization'].
tokenizer = nltk.tokenize.TweetTokenizer(
    preserve_case=cfg['preprocessing']['tokenization']['preserve_case'],
    strip_handles=cfg['preprocessing']['tokenization']['remove_twitter_handle'],
)

def tokenize_text(text, tokenizer):
  """Tokenize ``text`` by delegating to ``tokenizer.tokenize``.

  Args:
    text: the string to split.
    tokenizer: any object exposing a ``tokenize(text)`` method.

  Returns:
    Whatever ``tokenizer.tokenize(text)`` returns.
  """
  return tokenizer.tokenize(text)

# Tokenisation example on the first Task A training tweet
text = task_a_train_df['text'].iloc[0]
print('original text : ', text)
print('tokenized text: ', tokenizer.tokenize(text))

In [0]:
# Tokenize every training tweet and store the token list in a new 'tokenized' column.
# Applying over the 'text' column directly avoids building a Series object for
# every row (as DataFrame.apply(axis=1) does) and also behaves on an empty frame.
task_a_train_df['tokenized'] = task_a_train_df['text'].apply(tokenize_text, tokenizer=tokenizer)

In [0]:
# Display the Task A training frame (the last expression of a cell is rendered as its output).
task_a_train_df

## Word Cloud

In [0]:
def show_wordcloud(text, title, max_words=200):
  """Generate a word cloud from ``text`` and display it under ``title``.

  Args:
    text: string handed to ``WordCloud(...).generate``.
    title: used as the figure's super-title.
    max_words: forwarded to ``WordCloud`` as ``max_words``.

  Side effects:
    Prints the generated cloud's ``words_`` attribute, then draws the cloud
    on matplotlib figure number 1 (12 x 12) and calls ``plt.show()``.
  """
  cloud = WordCloud(
      background_color='white',
      max_words=max_words,
  ).generate(text)

  print(cloud.words_)

  figure = plt.figure(num=1, figsize=(12, 12))
  figure.suptitle(title)
  plt.imshow(cloud)
  plt.show()

In [0]:
# Scratch cell: print the positive Task B tweet stored under index label 4,
# then try the word-cloud helper on a small hand-written string.
df = task_b_train_df.loc[task_b_train_df['sentiment'] == 'positive', 'text'][4]
print(df)

word_string = "you verse wrote book stand titlea i you you you kampung"
print(type(word_string))
show_wordcloud(word_string, 'judul', max_words=20000)

In [0]:
# Bar chart of how many Task A training tweets carry each sentiment label.
task_a_train_df['sentiment'].value_counts().plot.bar(figsize=(7, 4), title='Number of tweets');
plt.xlabel('Sentiment');
plt.ylabel('Total tweets');

In [0]:
# Task B training tweets whose text contains the substring 'not'.
task_b_train_df.loc[task_b_train_df['text'].str.contains('not')]

In [0]:
# Count, per topic, how many Task B training tweets carry each sentiment label.
positive = task_b_train_df[task_b_train_df['sentiment'] == 'positive']['topic'].value_counts(sort=False).to_frame(name='positive')
negative = task_b_train_df[task_b_train_df['sentiment'] == 'negative']['topic'].value_counts(sort=False).to_frame(name='negative')
neutral = task_b_train_df[task_b_train_df['sentiment'] == 'neutral']['topic'].value_counts(sort=False).to_frame(name='neutral')

# Left-join on the topic index: one row per topic that has positive tweets.
joined = positive.join(negative).join(neutral)

# Keep only topics with at least one positive AND one negative tweet
# (a missing count is NaN, and NaN > 0 is False, so those rows drop out too).
joined = joined[(joined['positive'] > 0) & (joined['negative'] > 0)]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # show the full table
    print(joined)

# Stacked bars: positive counts at the bottom, negative counts on top.
plt.bar(range(joined.shape[0]), joined['positive'], label='positive', color='blue')
plt.bar(range(joined.shape[0]), joined['negative'], label='negative', color='red', bottom=joined['positive'])
plt.title('Positive and negative tweets per topic')
plt.xlabel('Topic (row position in the table printed above)')
plt.ylabel('Number of tweets')
# the bars are labelled, so actually draw the legend that explains the colours
plt.legend()
plt.show()

In [0]:
# Save the Task B training frame as a tab-separated file named 'coba_train_b'
# (a relative path, so it is written to the kernel's current working directory).
task_b_train_df.to_csv('coba_train_b', sep='\t')