# Gather and Label Train Data

This notebook loads, preprocesses, and labels the training data.

This notebook should be run in Google Colab.

# Install and Import Packages

In [None]:
# Imports needed for Colab sign-in and Google Sheets access.
from google.colab import auth
import gspread
from oauth2client.client import GoogleCredentials

# Sign in through Colab before asking for the application-default credentials.
auth.authenticate_user()

# NOTE(review): oauth2client is reportedly deprecated upstream — confirm this
# flow still works with the installed gspread version.
credentials = GoogleCredentials.get_application_default()
gc = gspread.authorize(credentials)

In [None]:
pip install --upgrade "ibm-watson>=5.1.0"

In [None]:
pip install jsbeautifier

In [None]:
!pip install contractions

In [None]:
!pip install emoji

In [None]:
from google.colab import drive

# Project folder on the mounted Google Drive.
google_drive_path = "/content/drive/My Drive/catchfire/"

# Make Google Drive available under /content/drive.
drive.mount('/content/drive')

In [None]:
# Data Science Packages
import pandas as pd
import sklearn.linear_model
import tensorflow as tf
import numpy as np
import json
import string
from collections import Counter

# JSON
import jsbeautifier
opts = jsbeautifier.default_options()
opts.indent_size = 2

# IBM Watson
from ibm_watson import ToneAnalyzerV3
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

# API Calls
import requests
from requests.auth import HTTPBasicAuth

# NLP Packages
import re
import spacy
import nltk
nltk.download('stopwords')
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup
from nltk.util import ngrams
import contractions 
from emoji import UNICODE_EMOJI


# Connect to Google Sheet and Download Data

In [None]:
# Workbook that holds one worksheet per ticker.
SHEET_URL = 'https://docs.google.com/spreadsheets/d/1vmCtCwZqPtFIEewhM1CWT7C1QuwJ3gB8bbK8S9Qtwlg/edit?fbclid=IwAR2cIq9qDzU2Q0KyH_hiAmC7Ibm2P3o0rvO60nJau1R7rgL0O_E9UsGThP4#gid=1276133210'
wb = gc.open_by_url(SHEET_URL)

In [None]:
# Worksheet names to download; each ticker has its own tab in the workbook.
tickers = [
    "JPM", "AAPL", "GOOG", "MSFT", "WMT", "AMZN", "DIS",
    "PFE", "XOM", "CVS", "PZZA", "TREE", "TMHC", "LOPE",
    "DISH", "CROX", "MMP", "THS", "ARMK", "BBY", "JLL",
    "BTC", "ETH", "XRP", "VET", "USDT", "ALGO", "XMR",
    "ATOM", "RVN", "AAVE", "BNB", "SPY", "UCXY", "SQQQ",
    "XLF", "GLD", "EEM", "XLE", "IWM", "QQQ", "SLV",
]

data_frames = []

for i, ticker in enumerate(tickers):
  sheet = wb.worksheet(ticker)
  raw = pd.DataFrame(sheet.get_all_values())
  # The first row of each worksheet holds the column names; the rest is data.
  df = raw[1:]
  df.columns = raw.iloc[0]
  data_frames.append(df)

# Stack every worksheet into one frame with a fresh 0..n-1 index.
unlabeled_data = pd.concat(data_frames).reset_index()

In [None]:
# Display the combined frame built from all ticker worksheets.
unlabeled_data

In [None]:
# Columns carried forward into preprocessing and labeling.
feature_cols = ["Content"]

# Take an explicit copy: the preprocessing cell assigns back into this frame,
# and assigning into a plain column subset triggers SettingWithCopyWarning.
unlabeled_data_feature = unlabeled_data[feature_cols].copy()

# Preprocess Text for Labeling

The same preprocessing steps will also be used at inference time.

In [None]:
def lower_case(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

# http/https link pattern, compiled once. Written as a raw string so the regex
# escapes are not parsed as (invalid) Python string escapes.
# Note: inside the character class, `\+-=` is a range from '+' to '=', which
# also covers ',', '-', '.', '/', digits, ':', ';' and '<'.
_LINK_REGEX = re.compile(
    r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)',
    re.DOTALL,
)


def strip_links(text):
    """Replace every http/https link in ``text`` with ``', '``.

    Each link is substituted at the position where it was matched. Replacing
    the matched strings with ``str.replace`` instead would damage a longer
    link whenever a shorter matched link is its prefix
    (e.g. ``http://a`` and ``http://a.b`` in the same text).

    Args:
        text: The string to clean. The pattern only matches lowercase
            ``http``/``https`` schemes.

    Returns:
        ``text`` with each link replaced by ``', '``.
    """
    return _LINK_REGEX.sub(', ', text)

def space_comma(text):
    """Surround every comma with spaces so it becomes its own token (',' -> ' , ')."""
    return text.replace(',', ' , ')

# Runs of characters from the emoji/symbol ranges below, compiled once.
# Python 3 strings are always full Unicode, so no surrogate-pair (narrow
# build) fallback pattern is needed.
_EMOJI_REGEX = re.compile(
    '(['
    '\U0001F300-\U0001F64F'
    '\U0001F680-\U0001F6FF'
    '\u2600-\u26FF\u2700-\u27BF]+)',
    re.UNICODE,
)


def space_emoji(text):
    """Pad each run of emoji characters with two spaces on both sides.

    Args:
        text: The string to process.

    Returns:
        ``text`` with every maximal run of characters from the ranges
        U+1F300-U+1F64F, U+1F680-U+1F6FF, U+2600-U+26FF and U+2700-U+27BF
        wrapped as ``'  <run>  '``.
    """
    return _EMOJI_REGEX.sub(r'  \1  ', text)

def space_period(text):
    """Drop the last period and turn every remaining period into ``' ; '``.

    Plain string methods are used instead of a regex; the original pattern
    literal ``'\\.'`` (non-raw) contained an invalid string escape.

    Args:
        text: The string to process.

    Returns:
        ``text`` with its final ``'.'`` removed (if any) and all other
        periods replaced by ``' ; '``.
    """
    head, sep, tail = text.rpartition(".")
    if sep:
        # A period exists: rebuild the text without its last occurrence.
        text = head + tail
    return text.replace(".", " ; ")

def consolidate_punc(text):
    """Strip sentence punctuation and append one terminator token.

    All of ``. ! ? , ;`` and backslashes are removed. The appended terminator
    is ``' ! '`` if the input had an exclamation mark, otherwise ``' ? '`` if
    it had a question mark, otherwise ``' . '``.
    """
    # Record which marks were present before they are stripped out.
    has_exclamation = "!" in text
    has_question = "?" in text

    stripped = re.sub(r'[.!?,;\\]', '', text)

    if has_exclamation:
        terminator = " ! "
    elif has_question:
        terminator = " ? "
    else:
        terminator = " . "

    return stripped + terminator

def remove_chars(text):
    """Remove or replace brackets, HTML entity prefixes and assorted symbols.

    The substitutions are applied one after another in the order listed,
    so a later rule sees the output of the earlier ones.
    """
    replacements = (
        ("[", ""),
        ("]", " , "),
        ("(", ""),
        (")", " , "),
        ("&amp", ""),
        ("&gt", ""),
        ("&lt", ""),
        ("*", ""),
        ("-", " , "),
        ("|", " "),
        (":", " "),
        ("@", " "),
        ("#", " "),
        ("$", " "),
        ("/", " "),
        ("\\", " "),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text

def remove_numbers(text):
    """Delete every run of digits from ``text``."""
    return re.sub(r'\d+', '', text)

def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    return contractions.fix(text)

# Single tokenizer instance shared by every call to tokenize().
tokenizer = ToktokTokenizer()


def tokenize(text):
    """Return the tokens the shared Toktok tokenizer produces for ``text``."""
    tokens = tokenizer.tokenize(text)
    return tokens

def join_text(text):
    """Join an iterable of string tokens into one string separated by single spaces."""
    separator = " "
    return separator.join(text)

def remove_dup(text):
    """Collapse doubled separator tokens: ``', ,'`` -> ``' ,'`` and ``'; ;'`` -> ``' ;'``."""
    for doubled, single in ((", ,", " ,"), ("; ;", " ;")):
        text = text.replace(doubled, single)
    return text

# Preprocessing steps, applied to the 'Content' column in exactly this order.
# NOTE(review): consolidate_punc strips ',' and ';', so the tokens inserted by
# space_comma and space_period do not survive that step — confirm this is intended.
preprocessing_steps = [
    lower_case,           # Make everything lowercase.
    strip_links,          # Take out all links.
    space_comma,          # Separate commas out into a separate token. ',' -> ' , '
    space_period,         # Separate periods out into a separate token and modify. '.' -> ' ; '
    space_emoji,          # Separate emojis out into a separate token.
    consolidate_punc,     # Look for special punctuation and add punctuation to end of sentence.
    remove_chars,         # Remove certain characters.
    remove_numbers,       # Remove numbers.
    expand_contractions,  # Expand contractions.
    tokenize,             # Tokenize ...
    join_text,            # ... and join the text again (makes spacing uniform).
    remove_dup,           # Remove some duplication that could occur from preprocessing.
]

processed_content = unlabeled_data_feature['Content']
for step in preprocessing_steps:
    processed_content = processed_content.map(step)

unlabeled_data_feature['Content'] = processed_content

# Vectorize Words to Explore Vocab. 


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words over unigrams and bigrams, limited to 5000 features.
cvectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    ngram_range=(1, 2),
    max_features=5000,
)

# NOTE(review): toarray() materialises a dense (documents x features) matrix,
# which can be very large — confirm a dense array is really needed.
bow_data_features = cvectorizer.fit_transform(unlabeled_data_feature['Content']).toarray()

In [None]:
# `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement
# `get_feature_names_out` and fall back only on older installations.
if hasattr(cvectorizer, "get_feature_names_out"):
  # get_feature_names_out returns an ndarray; keep `vocab` a plain list as before.
  vocab = cvectorizer.get_feature_names_out().tolist()
else:
  vocab = cvectorizer.get_feature_names()
vocab

In [None]:
# Row to spot-check: the raw text first, then its preprocessed counterpart.
data_item = 145555

for frame in (unlabeled_data, unlabeled_data_feature):
  print(frame['Content'][data_item])

In [None]:
# Number of rows in the preprocessed 'Content' column.
unlabeled_data_feature['Content'].shape[0]

In [None]:
# Print raw and preprocessed text side by side for a sample of 100 rows.
raw_content = unlabeled_data['Content']
clean_content = unlabeled_data_feature['Content']

for i in range(130000, 130100):
  print(raw_content[i])
  print(clean_content[i])
  print("---------------------------------")

# Label Data

In [None]:
import os
from getpass import getpass

# Never hard-code the API key in the notebook: read it from the environment,
# or prompt for it (the input is not echoed) when the variable is not set.
api_key = os.environ.get("IBM_TONE_ANALYZER_APIKEY") or getpass("IBM Tone Analyzer API key: ")

authenticator = IAMAuthenticator(api_key)
tone_analyzer = ToneAnalyzerV3(
    version='2021-03-26',
    authenticator=authenticator
)

tone_analyzer.set_service_url("https://api.us-south.tone-analyzer.watson.cloud.ibm.com/")

In [None]:
# Drive folder where the labeled JSON blocks are written and read back.
google_drive_path = "/content/drive/My Drive/catchfire/training_labeled_data_5_12/"

# Number of rows available for labeling.
unlabeled_data_feature['Content'].shape[0]

Label the data and save the results in blocks. The text is sent to the Tone Analyzer in limited chunks of sentences (100 rows per request).

In [None]:
# Number of rows concatenated into a single Tone Analyzer request.
BLOCK_SIZE = 100

for block in range(1650,1651):
  content = unlabeled_data_feature['Content']

  # Clamp the upper bound so a final, partially filled block does not index
  # past the end of the data.
  start = block * BLOCK_SIZE
  stop = min(start + BLOCK_SIZE, len(content))

  sentences = []
  for i in range(start, stop):
    sentences.append(content[i])

  if not sentences:
    print("block", block, "is past the end of the data; skipped")
    continue

  # Each row is followed by a single space, as the request text was built before.
  text = "".join(sentence + " " for sentence in sentences)

  tone_analysis = tone_analyzer.tone(
    {'text': text},
    content_type='application/json',
    sentences = True
  ).get_result()

  filename = "label_5_11_"+ str(block) +".json"

  print(block)

  # Persist the raw response so the labels can be reloaded without calling the API again.
  with open(google_drive_path+filename,"w") as json_file:
    json.dump(tone_analysis, json_file)

In [None]:
  # Re-read one saved block as a sanity check.
  # Relies on `block` still holding the value left by the labeling loop.
  filename = "label_5_11_"+ str(block) +".json"
  block_path = google_drive_path + filename

  with open(block_path) as json_file:
    data = json.load(json_file)

  data

Load the saved labeled blocks back into a single DataFrame.

In [None]:
# Tone score columns; a tone that is absent for a sentence is recorded as 0.
TONE_COLUMNS = ['anger','fear','joy','sadness','analytical','confident','tentative']

# Collect one record per sentence and build the frame once at the end.
# This avoids depending on a row counter left over from another cell, avoids
# an all-NaN placeholder row, and avoids growing the frame row by row.
records = []

# NOTE(review): the labeling cell writes block 1650, but this range stops at
# 1649 — confirm whether the last block should be loaded as well.
for block in range(0,1650):
  print(block)
  filename = "label_5_11_"+ str(block) +".json"

  with open(google_drive_path+filename) as f:
    data = json.load(f)

  for sent in data['sentences_tone']:
    record = {"text": sent['text']}
    record.update(dict.fromkeys(TONE_COLUMNS, 0))
    # Overwrite the zero default with the reported score for each tone.
    for tone in sent['tones']:
      record[tone['tone_id']] = tone['score']
    records.append(record)

label_data = pd.DataFrame(records, columns=['text'] + TONE_COLUMNS)
label_data[TONE_COLUMNS] = label_data[TONE_COLUMNS].astype(float)

label_data.describe()

Save to Excel File

In [None]:
# Write the labeled sentences to a workbook in the labeled-data Drive folder.
excel_path = google_drive_path + "labeled_data.xlsx"
label_data.to_excel(excel_path, sheet_name='Label_Data')