# Dive into Abusive Language with Snorkel

Author: BingYune Chen 
<br>
Updated: 2021-08-02
<br><br>
Unlabeled Data is provided by [Twitter Archive](https://archive.org/details/twitterstream) 

----------

We use the collection of JSON files grabbed from the general Twitter stream. The data is from the "Spritzer" version of the stream, which is a light and shallow Twitter grab. The data is provided for the purposes of research, history, testing, and memory.

In [None]:
# Imports and setup for Google Colab
# This cell only works inside Colab: it mounts Google Drive and installs
# the spaCy packages used by the later cells.

# Mount Google Drive
from google.colab import drive ## module to use Google Drive with Python
drive.mount('/content/drive') ## mount to access contents

# Install python libraries
# Each `!` line is run as a shell command, in the order written below.
! pip install spacy-langdetect --quiet
! pip install -U pip setuptools wheel --quiet
! pip install -U spacy --quiet
! python -m spacy download en_core_web_sm --quiet
## use 'en_core_web_trf' for slower, more accurate pipeline

In [None]:
# Check GPU status on Google Colab
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

In [None]:
# Download the compressed files for dates 2020 Jan to Jun (.tar)
for m in ['01', '02', '03', '04', '05', '06']: 
## '01' - all, '02' - 10d, '03' - 12d, '04' - none, '05' - 7d, '06' - none
    for d in range(1, 31 + 1):

        if d < 10:
            file_num = '0' + str(d)
        else:
            file_num = d

        try:
            ! wget https://archive.org/download/archiveteam-twitter-stream-2020-{m}/twitter_stream_2020_{m}_{file_num}.tar
        except:
            pass


In [None]:
# Unzip the compressed files for dates 2020 Jan to Jun (.tar)
import tarfile  ## imported here: the notebook's import cell comes later


def extract_tar(tar_path, dest_dir='./2020_q1q2'):
    """Extract one .tar archive into dest_dir.

    Returns True when the archive was extracted. Returns False, after
    printing the reason, when the file is missing or cannot be read as a
    tar archive, so a loop over many candidate files keeps going.
    """
    try:
        # The context manager closes the archive even if extraction fails
        with tarfile.open(tar_path) as temp_tar:
            # NOTE(review): extractall trusts the member paths stored in
            # the archive; only use it on archives from a trusted source.
            temp_tar.extractall(dest_dir)
    except FileNotFoundError:
        print('Skipping missing file...{}'.format(tar_path))
        return False
    except (tarfile.TarError, OSError, EOFError) as err:
        print('Could not extract {}: {!r}'.format(tar_path, err))
        return False
    return True


for m in ['01', '02', '03', '04', '05', '06']: ## months

    print('Starting month {}...'.format(m))

    for d in range(1, 31 + 1): ## days 31 + 1

        # Zero-pad the day so it matches the downloaded file names
        day_num = '{:02d}'.format(d)

        print('Starting day {}...'.format(day_num))

        tar_name = 'twitter_stream_2020_{}_{}.tar'.format(m, day_num)
        print('Extracting file...{}'.format(tar_name))

        # Days that were never downloaded are reported and skipped
        extract_tar(tar_name)

In [None]:
# Download the compressed files for dates 2020 Jul to Dec (.zip)
for m in ['07', '08', '09', '10', '11', '12']: # all

    for d in range(1, 31 + 1):

        if d < 10:
            file_num = '0' + str(d)
        else:
            file_num = d

        try:
            ! wget https://archive.org/download/archiveteam-twitter-stream-2020-{m}/twitter-stream-2020-{m}-{file_num}.zip
        except:
            pass

In [None]:
# Unzip the compressed file for dates 2020 Jul to Dec (.zip)
for m in ['07', '08', '09', '10', '11', '12']: ## months 

    print('Starting month {}...'.format(m))

    for d in range(1, 31 + 1): ## days 31 + 1

        if d < 10:
            day_num = '0' + str(d)
        else:
            day_num = d

        print('Starting day {}...'.format(day_num))

        try:
            ! unzip twitter-stream-2020-{m}-{day_num}.zip
        except:
            pass

In [None]:
# Read tar files
import tarfile

# Imports for data and plotting
import pandas as pd
import numpy as np

# Imports for spaCy preprocessing
from spacy_langdetect import LanguageDetector
from spacy.language import Language
import spacy

# Load the small English spaCy pipeline
nlp = spacy.load('en_core_web_sm')


@Language.factory("language_detector")
def create_lang_detector(nlp, name):
    """Build the component registered as "language_detector".

    Both arguments are accepted but unused here; a fresh
    LanguageDetector is returned on every call.
    """
    return LanguageDetector()


# Put the detector at the very end of the pipeline
nlp.add_pipe('language_detector', last=True)

In [None]:
# Filter tweets for English for 2020-10-31 
for m in ['10']: ## months '07', '08', '09', '10', '11', '12' 

    print('Starting month {}...'.format(m))

    for d in range(31, 32): ## days 31 + 1

        if d < 10:
            day_num = '0' + str(d)
        else:
            day_num = d

        print('Starting day {}...'.format(day_num))

        try:
            
            for h in range(0, 24): ## hours 24

                if h < 10:
                    hr_num = '0' + str(h)
                else:
                    hr_num = h

                print('Starting hour {}...'.format(hr_num))

                %cd SAVE_PATH ## add correct file path to zip files

                for n in range(0, 60): ## minutes 60

                    if n < 10:
                        min_num = '0' + str(n)
                    else:
                        min_num = n

                    print('Starting minute {}...'.format(min_num))

                    try:
                        ! bunzip2 -k {min_num}.json.bz2 

                        hydrate_df = pd.read_json(
                            '{}.json'.format(min_num), lines=True
                            )
                        
                        hydrate_df['language'] = [
                            nlp(x)._.language['language'] for x in 
                            hydrate_df.text.fillna(" ") if x is not None
                        ]

                        temp_df = hydrate_df.loc[
                            hydrate_df['language'] == 'en', ['text']
                            ]
                    
                        print('Saving file 2020{}{}_{}_{}...'.format(
                            m, day_num, hr_num, min_num
                            )
                        )

                        temp_df.to_csv('2020{}{}_{}_{}.txt'.format(
                            m, day_num, hr_num, min_num
                            ), index=False
                        )
                    except:
                        pass
        except:
            pass

In [None]:
# Filter tweets for English for 2020-11-01 to 2020-11-06 
for m in ['11']: ## months '07', '08', '09', '10', '11', '12' 

    print('Starting month {}...'.format(m))

    for d in range(1, 7): ## days 31 + 1

        if d < 10:
            day_num = '0' + str(d)
        else:
            day_num = d

        print('Starting day {}...'.format(day_num))

        try:
            
            for h in range(0, 24): ## hours 24

                if h < 10:
                    hr_num = '0' + str(h)
                else:
                    hr_num = h

                print('Starting hour {}...'.format(hr_num))

                %cd SAVE_PATH ## add correct file path to zip files

                for n in range(0, 60): ## minutes 60

                    if n < 10:
                        min_num = '0' + str(n)
                    else:
                        min_num = n

                    print('Starting minute {}...'.format(min_num))

                    try:
                        ! bunzip2 -k {min_num}.json.bz2 

                        hydrate_df = pd.read_json(
                            '{}.json'.format(min_num), lines=True
                            )
                        
                        hydrate_df['language'] = [
                            nlp(x)._.language['language'] for x in 
                            hydrate_df.text.fillna(" ") if x is not None
                        ]

                        temp_df = hydrate_df.loc[
                            hydrate_df['language'] == 'en', ['text']
                            ]
                    
                        print('Saving file 2020{}{}_{}_{}...'.format(
                            m, day_num, hr_num, min_num
                            )
                        )

                        temp_df.to_csv('2020{}{}_{}_{}.txt'.format(
                            m, day_num, hr_num, min_num
                            ), index=False
                        )
                    except:
                        pass
        except:
            pass