In [None]:
import glob
import os
import random

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Sampling for gold labels

In [4]:
# Collect the base name (directory and extension stripped) of every raw file.
# os.path.basename splits on the platform's own separator, so this no longer
# depends on glob returning Windows-style '\\' paths. The listing is sorted
# because glob's order is arbitrary and the seeded sampling further down is
# only repeatable when its input order is.
filenames = []
for file in sorted(glob.glob("../Dataset/rawData1/*.txt")):
    filenames.append(os.path.basename(file).split('.')[0])

# Load the ticker/sector sheet; a missing sector is recorded as 0.
masterfilepath = '../Dataset/SectorTickerData.xlsx'
data = pd.read_excel(masterfilepath)
# Assign the result back: calling replace(..., inplace=True) on a column
# selection is not guaranteed to modify `data` itself.
data['GSECTOR'] = data['GSECTOR'].replace(np.nan, 0)

# Unique (ticker, sector) pairs in first-seen row order. The index is reset so
# that positional and label-based lookups on this frame agree.
set_ticker_sector = (
    data[['Ticker', 'GSECTOR']]
    .drop_duplicates()
    .rename(columns={'GSECTOR': 'Sector'})
    .reset_index(drop=True)
)

# Distinct sectors and years, in first-seen order, used as axis labels below.
sectors = list(pd.unique(data['GSECTOR']))
# Date % 10000 keeps the last four decimal digits of Date
# (presumably the year of a ...YYYY integer date -- TODO confirm the encoding).
years = list((data['Date'] % 10000).unique())

In [6]:
# Build ticker_in_sectors[sector_idx][year_idx] -> list of file names.
# File names are expected to look like "<ticker>_<date>", where date % 10000
# yields the year. Files from 2015 and 2016 are left out.
EXCLUDED_YEARS = {2015, 2016}

# Sector of each ticker (first listed pair wins), built once instead of
# re-scanning the whole ticker column for every file.
sector_of_ticker = {}
for known_ticker, known_sector in zip(set_ticker_sector['Ticker'], set_ticker_sector['Sector']):
    sector_of_ticker.setdefault(known_ticker, known_sector)

# One slot per entry of `years`, so years.index(year) is always a valid
# position whatever order the years appear in. Slots of excluded years simply
# stay empty.
ticker_in_sectors = [[[] for _ in range(len(years))] for _ in range(len(sectors))]
for i in filenames:
    tic = i.split('_')[0]
    year = int(i.split('_')[1])%10000
    if year in EXCLUDED_YEARS:
        continue
    if tic not in sector_of_ticker:
        raise ValueError("ticker %r (file %r) has no sector in set_ticker_sector" % (tic, i))
    idx_x = sectors.index(sector_of_ticker[tic])
    idx_y = years.index(year)
    ticker_in_sectors[idx_x][idx_y].append(i)

In [8]:
def get_gold_label_tickers(seed_used, tickers):
    """Sample two file names from every sufficiently large (sector, year) group.

    Args:
        seed_used: value passed to ``random.seed`` before any sampling.
        tickers: nested list indexed as ``tickers[sector_idx][year_idx]``, each
            entry being a list of file names of the form ``<ticker>_<...>``.

    Returns:
        A list with one two-element list of sampled file names per group that
        was sampled.

    Side effects:
        Writes one row per sampled file (sector, ticker, year, file name) to
        '../Dataset/GoldLabels_TickerSectorMapping.xlsx'. Sector and year labels
        are looked up in the module-level ``sectors`` and ``years`` lists.
    """
    random.seed(seed_used)
    sampled_groups = []
    mapping_rows = []
    for sector_pos, per_year in enumerate(tickers):
        for year_pos, candidates in enumerate(per_year):
            # NOTE(review): only groups with MORE than two files are sampled;
            # a group of exactly two is skipped -- confirm this is intended.
            if not len(candidates) > 2:
                continue
            chosen = random.sample(candidates, 2)
            sampled_groups.append(chosen)
            for picked in chosen:
                mapping_rows.append(
                    [sectors[sector_pos], picked.split('_')[0], years[year_pos], picked]
                )
    mapping = pd.DataFrame(mapping_rows, columns=['Sector','Ticker','Year','Filename'])
    mapping.to_excel('../Dataset/GoldLabels_TickerSectorMapping.xlsx')
    return sampled_groups

In [None]:
# Draw the gold-label sample with a fixed seed so the selection is repeatable.
gold_tickers = get_gold_label_tickers(seed_used=1729, tickers=ticker_in_sectors)

In [None]:
# Copy every sampled gold-label file from the raw folder into its own folder.
gold_label_dir = '../Dataset/rawDataGoldLabel/'
# Create the destination up front so the first write cannot fail on a
# missing directory.
os.makedirs(gold_label_dir, exist_ok=True)
for file_list in gold_tickers:
    for gold_name in file_list:
        # `with` closes both handles even if the read or write raises.
        with open('../Dataset/rawData1/'+gold_name+'.txt',"r") as src, \
                open(gold_label_dir+gold_name+'.txt',"wt") as dst:
            dst.write(src.read())


## Sampling for BERT

### Training data for BERT-G

In [12]:
# Gather every manually labelled sentence from the gold-label workbooks.
sentences = []
label = []

# glob returns paths in arbitrary order; sorting fixes the row order of `df`
# so the seeded train/test splits below give the same result on any machine.
for file in sorted(glob.glob("../Dataset/GoldLabelledData/*.xlsx")):
    # A dedicated name avoids overwriting the `data` frame loaded earlier.
    gold_sheet = pd.read_excel(file)
    sentences.extend(gold_sheet['sentence'])
    label.extend(gold_sheet['Manual'])

# np.transpose turns the two parallel lists into rows of (label, text).
df = pd.DataFrame(np.transpose([label,sentences]),columns=['label','text'])

In [13]:
# Five independent splits of the same frame, one per seed. Each call holds out
# 10% of the rows as the test set; the validation share is sliced off the
# remaining training part afterwards.
_split_seeds = (1729, 13832, 110656, 42, 149)
_splits = [
    train_test_split(df['text'], df['label'], test_size=0.10, random_state=_seed)
    for _seed in _split_seeds
]
X_train, X_test, y_train, y_test = _splits[0]
X_train2, X_test2, y_train2, y_test2 = _splits[1]
X_train3, X_test3, y_train3, y_test3 = _splits[2]
X_train4, X_test4, y_train4, y_test4 = _splits[3]
X_train5, X_test5, y_train5, y_test5 = _splits[4]

In [14]:
# The validation fold of each split is the leading part of its training fold,
# cut to the same number of rows as the matching test fold.
X_valid, y_valid = X_train[:len(X_test)], y_train[:len(y_test)]
X_valid2, y_valid2 = X_train2[:len(X_test2)], y_train2[:len(y_test2)]
X_valid3, y_valid3 = X_train3[:len(X_test3)], y_train3[:len(y_test3)]
X_valid4, y_valid4 = X_train4[:len(X_test4)], y_train4[:len(y_test4)]
X_valid5, y_valid5 = X_train5[:len(X_test5)], y_train5[:len(y_test5)]

In [15]:
def _label_text_frame(labels, texts):
    """Pair labels with texts row by row in a frame with columns label, text."""
    return pd.DataFrame(np.transpose([labels, texts]), columns=['label','text'])


# Training frames: whatever is left of each training fold once the rows taken
# for validation (the first len(X_valid*) rows) are skipped.
train1 = _label_text_frame(y_train[len(X_valid):], X_train[len(X_valid):])
train2 = _label_text_frame(y_train2[len(X_valid2):], X_train2[len(X_valid2):])
train3 = _label_text_frame(y_train3[len(X_valid3):], X_train3[len(X_valid3):])
train4 = _label_text_frame(y_train4[len(X_valid4):], X_train4[len(X_valid4):])
train5 = _label_text_frame(y_train5[len(X_valid5):], X_train5[len(X_valid5):])

# Validation frames.
valid1 = _label_text_frame(y_valid, X_valid)
valid2 = _label_text_frame(y_valid2, X_valid2)
valid3 = _label_text_frame(y_valid3, X_valid3)
valid4 = _label_text_frame(y_valid4, X_valid4)
valid5 = _label_text_frame(y_valid5, X_valid5)

# Test frames.
test1 = _label_text_frame(y_test, X_test)
test2 = _label_text_frame(y_test2, X_test2)
test3 = _label_text_frame(y_test3, X_test3)
test4 = _label_text_frame(y_test4, X_test4)
test5 = _label_text_frame(y_test5, X_test5)

In [16]:
# Write every BERT-G fold to CSV (no index column), in train/valid/test order
# for folds 1 to 5.
_bert_g_outputs = [
    (train1, '../Dataset/BERT-G/train1.csv'),
    (valid1, '../Dataset/BERT-G/valid1.csv'),
    (test1, '../Dataset/BERT-G/test1.csv'),
    (train2, '../Dataset/BERT-G/train2.csv'),
    (valid2, '../Dataset/BERT-G/valid2.csv'),
    (test2, '../Dataset/BERT-G/test2.csv'),
    (train3, '../Dataset/BERT-G/train3.csv'),
    (valid3, '../Dataset/BERT-G/valid3.csv'),
    (test3, '../Dataset/BERT-G/test3.csv'),
    (train4, '../Dataset/BERT-G/train4.csv'),
    (valid4, '../Dataset/BERT-G/valid4.csv'),
    (test4, '../Dataset/BERT-G/test4.csv'),
    (train5, '../Dataset/BERT-G/train5.csv'),
    (valid5, '../Dataset/BERT-G/valid5.csv'),
    (test5, '../Dataset/BERT-G/test5.csv'),
]
for _frame, _csv_path in _bert_g_outputs:
    _frame.to_csv(_csv_path, index=False)

### Training data for BERT-W

In [None]:
# Base names of the gold-labelled workbooks, so they can be kept out of the
# BERT-W training data. os.path.basename uses the platform's own separator
# instead of assuming glob returns Windows-style '\\' paths.
# NOTE: this cell reassigns `filenames`, `data`, `sectors` and `years`, which
# the gold-label cells above also define.
filenames = []
for file in glob.glob("../Dataset/GoldLabelledData/*.xlsx"):
    filenames.append(os.path.basename(file).split('.')[0])

# Master sheet; a missing sector is recorded as 0.
data = pd.read_excel('../Dataset/MasterFile.xlsx')
# Assign the result back: calling replace(..., inplace=True) on a column
# selection is not guaranteed to modify `data` itself.
data['Sector'] = data['Sector'].replace(np.nan, 0)
sectors = list(pd.unique(data['Sector']))
# Date % 10000 keeps the last four decimal digits of Date
# (presumably the year of a ...YYYY integer date -- TODO confirm the encoding).
years = list((data['Date'] % 10000).unique())

In [None]:
# Collect, per sector, the tickers of every master-sheet row whose file is not
# one of the gold-labelled files.
gold_files = set(filenames)  # set membership instead of scanning the list per row
tickers_in_sector = [[] for _ in range(len(sectors))]
for row_sector, row_filename, row_ticker in zip(data['Sector'], data['Filename'], data['Ticker']):
    if row_filename not in gold_files:
        tickers_in_sector[sectors.index(row_sector)].append(row_ticker)

# Reduce each sector to its unique tickers in first-seen order. Going through
# a Series keeps the result a numpy array and avoids calling pd.unique on a
# plain list, which is deprecated.
for i in range(len(tickers_in_sector)):
    tickers_in_sector[i] = pd.Series(tickers_in_sector[i], dtype=object).unique()

# Number of unique tickers per sector. This previously read the similarly
# named gold-label array `ticker_in_sectors` by mistake.
count_of_ticker = [len(sector_tickers) for sector_tickers in tickers_in_sector]

In [None]:
def get_weak_label_tickers(seed_used, n, tickers):
    """Randomly pick a fixed fraction of the tickers of every sector.

    Args:
        seed_used: value passed to ``random.seed`` before any sampling.
        n: fraction of each group to keep; ``int(n * len(group))`` tickers are
            drawn from each group.
        tickers: one collection of tickers per sector. Each may be a numpy
            array (anything exposing ``tolist``) or a plain list/tuple.

    Returns:
        A list with one list of sampled tickers per input group, in input order.

    Raises:
        ValueError: propagated from ``random.sample`` when the requested
            sample size is negative or larger than the group.
    """
    random.seed(seed_used)
    tickers_to_be_used = []
    for group in tickers:
        # Accept plain sequences as well as arrays.
        population = group.tolist() if hasattr(group, 'tolist') else list(group)
        sample_size = int(n*len(population))
        tickers_to_be_used.append(random.sample(population, sample_size))
    return tickers_to_be_used

In [None]:
# Take 50% of the tickers from each sector, once per seed.
# The pool is `tickers_in_sector` (unique tickers per sector, gold-labelled
# files excluded). The similarly named `ticker_in_sectors` is the
# sector x year array of gold-label file names and must not be used here.
n = 0.5
tickers_to_be_used1 = get_weak_label_tickers(1729,n,tickers_in_sector)
tickers_to_be_used2 = get_weak_label_tickers(13832,n,tickers_in_sector)
tickers_to_be_used3 = get_weak_label_tickers(110656,n,tickers_in_sector)
tickers_to_be_used4 = get_weak_label_tickers(42,n,tickers_in_sector)
tickers_to_be_used5 = get_weak_label_tickers(149,n,tickers_in_sector)

In [None]:
def get_training_data(tickers_to_be_used,save_file_as,files_used_list_saved_as):
    """Build a class-balanced weak-label training set and write it to CSV.

    Every workbook in '../Dataset/WeakLabelledData/' whose ticker is among the
    selected ones contributes the same number of in-claim and out-of-claim
    sentences (the size of its smaller class), taken in row order. Only the
    first workbook seen for a given (ticker, year) pair is used.

    Args:
        tickers_to_be_used: list of lists of tickers (one inner list per sector).
        save_file_as: CSV path for the (label, text) training rows.
        files_used_list_saved_as: CSV path for the workbook name behind each
            training row, written in the same row order.

    Returns:
        None. Both CSV files are written as a side effect.

    Raises:
        ValueError: if a workbook name does not split into exactly
            ``<ticker>_<date>`` or the date part is not an integer.
    """
    # Flatten the per-sector selections into one lookup set.
    ticker_set = {ticker for group in tickers_to_be_used for ticker in group}

    sentences = []
    inclaim = []
    files_used = []
    completed_ticker = set()

    # glob returns paths in arbitrary order; sorting makes both the row order
    # and the choice among duplicate (ticker, year) workbooks repeatable.
    for file in sorted(glob.glob("../Dataset/WeakLabelledData/*.xlsx")):
        # os.path.basename splits on the platform's own separator rather than
        # assuming Windows-style '\\' paths.
        filename = os.path.basename(file)
        fileticker, fileyear = filename.split('.')[-2].split('_')
        # Last four decimal digits of the date part (presumably the year -- TODO confirm).
        fileyear = int(fileyear)%10000

        # Skip tickers that were not selected and (ticker, year) pairs already done.
        if (fileticker,fileyear) in completed_ticker or fileticker not in ticker_set:
            continue

        data2 = pd.read_excel(file)
        # A row is in-claim when its Inclaim value is positive. The same test
        # drives both the quota and the per-row labelling below, so the two
        # can never disagree.
        is_inclaim = data2['Inclaim'] > 0
        inclaim_total = int(is_inclaim.sum())
        outclaim_total = len(data2) - inclaim_total

        # Equal quota for both classes: the size of the smaller one.
        count = min(inclaim_total,outclaim_total)
        inclaim_num = count
        outclaim_num = count

        for row_is_inclaim, sentence in zip(is_inclaim, data2['sentence']):
            if inclaim_num==0 and outclaim_num==0:
                break
            if row_is_inclaim:
                # Surplus in-claim rows are dropped. Previously they fell
                # through to the out-of-claim branch and were stored with
                # label 0.
                if inclaim_num>0:
                    inclaim.append(1)
                    sentences.append(sentence)
                    inclaim_num-=1
                    files_used.append(filename)
            elif outclaim_num>0:
                inclaim.append(0)
                sentences.append(sentence)
                outclaim_num-=1
                files_used.append(filename)
        completed_ticker.add((fileticker,fileyear))

    # Save the training rows and the list of source workbooks.
    df = pd.DataFrame(np.transpose([inclaim,sentences]),columns=['label','text'])
    df.to_csv(save_file_as,index=False)
    pd.DataFrame(files_used,columns=['Filename']).to_csv(files_used_list_saved_as,index=False)

In [None]:
# Build the BERT-W training file and its source-file list for each of the five
# ticker selections.
_bert_w_jobs = [
    (tickers_to_be_used1, '../Dataset/BERT-W/train1.csv', '../Dataset/BERT-W/list1.csv'),
    (tickers_to_be_used2, '../Dataset/BERT-W/train2.csv', '../Dataset/BERT-W/list2.csv'),
    (tickers_to_be_used3, '../Dataset/BERT-W/train3.csv', '../Dataset/BERT-W/list3.csv'),
    (tickers_to_be_used4, '../Dataset/BERT-W/train4.csv', '../Dataset/BERT-W/list4.csv'),
    (tickers_to_be_used5, '../Dataset/BERT-W/train5.csv', '../Dataset/BERT-W/list5.csv'),
]
for _selection, _train_csv, _list_csv in _bert_w_jobs:
    get_training_data(_selection, _train_csv, _list_csv)

### Training data for BERT-WG

In [32]:
# BERT-WG training data = BERT-W training rows followed by BERT-G training
# rows, built fold by fold from the CSV files written earlier.
_bert_g_train_paths = (
    '../Dataset/BERT-G/train1.csv',
    '../Dataset/BERT-G/train2.csv',
    '../Dataset/BERT-G/train3.csv',
    '../Dataset/BERT-G/train4.csv',
    '../Dataset/BERT-G/train5.csv',
)
_bert_w_train_paths = (
    '../Dataset/BERT-W/train1.csv',
    '../Dataset/BERT-W/train2.csv',
    '../Dataset/BERT-W/train3.csv',
    '../Dataset/BERT-W/train4.csv',
    '../Dataset/BERT-W/train5.csv',
)
_bert_wg_train_paths = (
    '../Dataset/BERT-WG/train1.csv',
    '../Dataset/BERT-WG/train2.csv',
    '../Dataset/BERT-WG/train3.csv',
    '../Dataset/BERT-WG/train4.csv',
    '../Dataset/BERT-WG/train5.csv',
)

# Load the gold (BERT-G) folds, then the weak (BERT-W) folds.
tg1, tg2, tg3, tg4, tg5 = [pd.read_csv(_path) for _path in _bert_g_train_paths]
tw1, tw2, tw3, tw4, tw5 = [pd.read_csv(_path) for _path in _bert_w_train_paths]

# Stack weak rows on top of gold rows with a fresh 0..n-1 index.
twg1, twg2, twg3, twg4, twg5 = [
    pd.concat([_weak, _gold], ignore_index=True)
    for _weak, _gold in zip((tw1, tw2, tw3, tw4, tw5), (tg1, tg2, tg3, tg4, tg5))
]

# Write the combined folds.
for _combined, _path in zip((twg1, twg2, twg3, twg4, twg5), _bert_wg_train_paths):
    _combined.to_csv(_path,index=False)