In [19]:
def obtain_data(file):
    """
    :param file: input data file 
    :return: Dict in the form of {data:[[tweets],[sentiments]], x_set:{tweets}, y_set:{sentiments}}
    """
    data = []
    dictionary = {}
    allKeys = []
    allVals = []
    
    with open(file,'r',encoding="utf8") as f:
        lines = f.readlines()
        index=0
        for i in range (len(lines)-2000):

            if lines[i] == '\n':
                data.append(lines[index:i]) #append everything until it encounters a \n
                index = i+1
            lines[i] = lines[i].replace('\n','')# replace the \n at the end of each sentiments
            lines[i] = lines[i].split(' ') #split the line into their respective parts  

        #convert to keys and values (dict)
        for i in range(len(data)):
            data_values = data[i]
            for j in range(len(data_values)):
                if len(data_values[j]) > 2:
                    for k in range(1, len(data_values[j]) - 1):
                        data_values[j][0] += " "
                        data_values[j][0] += data_values[j][k]

            key = [word[0] for word in data_values] #tweet
            val = [word[-1] for word in data_values] #sentiment

            data[i] = [key,val]

        #data[i][0] gives the tweets
        for i in range(len(data)):
            for j in range(len(data[i][0])):
                allVals.append(data[i][1][j]) #appending the sentiments corresponding to the tweets
             
        for i in range(len(data)):
            for j in range(len(data[i][0])):
                #print(data[i][0][j]) #give each word
                allKeys.append(data[i][0][j])
        setKeys = set(allKeys)
        setVals = set(allVals)   

    return dict(data=data,x_set=setKeys,y_set=setVals)


#obtain_data('sg_train')

In [2]:
#e(x|y) = Count(y -> x)/Count(y)
#Count(y->x) means number of times you see x generated from y

import pandas as pd
import numpy as np

def calculate_emission_count(parsed_data):
    """
    :param parsed_data: input preprocessed dataset
    :return: emissions dataframe and count(y)
    ,where emissions dataframe is the count of the sentiments tagged to the tweet whilecount(y) is the number of times the sentiment appears
    """
    data = parsed_data['data']
    x_set = parsed_data['x_set']
    y_set = parsed_data['y_set']
    #create a new datafram of zeros with keys (ie.tweets) as the index and sentiments as the columns
    count_emissions_df = pd.DataFrame(np.zeros((len(x_set),len(y_set))),index=x_set,columns=y_set)
    count_y = pd.Series(np.zeros(len(y_set)),index=y_set) #create a series object of zeros with index as the sentiments => to store the number times the sentiments appear
    #print(count_y)
    #print(count_emissions_df) #datafram structure: where its tweets against columns of sentiments
    
    for word in data:
        #print(word) #format of data => [[keys],[values]]
        #keys are the tweets, values are the sentiments
        tweets_data,sentiments_data = word
        
        for i in range(len(tweets_data)):
            tweet,sentiment = tweets_data[i],sentiments_data[i] #associate the tweet with its sentiment
            #print(tweet,sentiment)
            #print(sentiment)
            #+1 to the row,col, given the tweet, sentiment freq +1
            count_emissions_df.loc[tweet,sentiment] += 1 #.loc[] access a grp of rows and columns by labels
            #count_emissions_df is for Count(y->x) [counting the number of times a sentiment wrt to the tweet]
            count_y[sentiment] += 1 #incrementing the number of time the respective sentiment appear
    return count_emissions_df,count_y
    
def get_emission_params(parsed_data):
    count_emissions_df,count_y = calculate_emission_count(parsed_data)
    return count_emissions_df/count_y #e(x|y), where x is the tweet, and y is the sentiment


em_df = get_emission_params(obtain_data('sg_train'))
#get_emission_counts(obtain_data('sg_train'))
em_df.head()


Unnamed: 0,B-negative,B-positive,O,I-positive,I-negative,I-neutral,B-neutral
denver,0.0,0.0,0.0,0.0,0.0,0.0,5.6e-05
@annebethasha,0.0,0.0,4e-06,0.0,0.0,0.0,0.0
#superpredators,0.0,0.0,4e-06,0.0,0.0,0.0,0.0
refs,0.0,0.0,1.7e-05,0.0,0.0,0.0,0.0
#TheFlash,0.0,0.000154,4e-06,0.0,0.0,0.0,0.0


In [15]:
def calculate_new_emission_counts(parsed_data,k):
    """
    :param parsed_data: input preprocessed dataset
    :param k:  number of occurrences
    :return: new dataframe with failed tweets replaced with #UNK# and count(y)
    """
    
    count_emissions_df,count_y = calculate_emission_count(parsed_data)
    #.sum(axis = 1(sum the column), axis = 0 (sum the index))
    count_tweet_appearance = count_emissions_df.sum(axis=1)
    #print(count_tweet_appearance) #counting the number of times each tweet appears by summing everything across the columns
    '''Output of count_tweet appearance eg. 
        seems                      15.0
        https://t.co/h6Ie4IBJ08     1.0
        #AnnaVonHausswolff          2.0
        Bowery                      2.0
        refuge                      2.0
        @chuckielufc                1.0
        https://t.co/7xSNeWemp1     1.0
        @chris_steller              3.0
        unexpected                  3.0
        #usantdp                    1.0
        Ones                        2.0
        1979                        4.0
        @joceltsh                   1.0
        @TomBoxingAsylum            2.0
        @thistletat13               2.0
        @eibeibb                    2.0
        @TalatHussain12             1.0
        Ilkeston                    2.0
        @ricosua                    1.0
        Belarus                     2.0
        charms                      2.0
        @EvermorSolution            2.0
        https://t.co/WrcuWKQ0Xg     2.0
        FIRED                       2.0'''
    
    failed_tweets = count_tweet_appearance[count_tweet_appearance<k]
    #print(failed_tweets)
    '''eg output if k<3 (ie. tweets with occurence less than 3 times) is:
        @Nandos                    1.0
        @rcmpgrcpolice             1.0
        #yas                       1.0
        @just                      2.0
        ford                       1.0
        attracted                  2.0
        @Unitetheunion             1.0
        .....        '''
    
    #replace the tweets that occur less than 1.0 with "#UNK#"
    #print(failed_tweets.index) #gives all the tweets that <1.0
    
    replace_tweets = count_emissions_df.loc[failed_tweets.index].sum(axis=0)
    replace_tweets.name  = '#UNK#'
    
    new_df = count_emissions_df.append(replace_tweets)
    new_df = new_df.drop(failed_tweets.index,axis=0) #drop all failed_tweets words
    headers = new_df.dtypes.index
    print(headers)
    #print(headers[2]) #gives sentiment O
    new_df.at['#UNK#',headers[2]] = 1.0
    #print(new_df) #without failed_tweets words inside, has #UNK# row inside at the bottom
    
    return new_df, count_y

def get_new_emission_params(parsed_data,k):
    """
    :param parsed_data: input preprocessed dataset
    :param k: number of occurrences
    :return: new emission params 
    """
    count_emissions_df,count_y = calculate_new_emission_counts(parsed_data,k)
    return count_emissions_df/count_y #e(x|y), where x is the tweet, and y is the sentiment

#calculate_new_emission_counts(obtain_data('sg_train'),3)
new_em_df_parameters = get_new_emission_params(obtain_data('sg_train'),1)
#new_em_df_parameters.sum(axis=1) #gives the sum of each rows (individual respective words)
'''eg.
Throwing          0.000012
headquarters      0.000300
insist            0.000012
except            0.000071
Broken            0.000128
LCD               0.000124
occur             0.000012
sound             0.000071
'''

new_em_df_parameters.sum(axis=0) #gives the counts of the sentiments
new_em_df_parameters.tail()

Index(['B-negative', 'B-positive', 'O', 'I-positive', 'I-negative',
       'I-neutral', 'B-neutral'],
      dtype='object')


Unnamed: 0,B-negative,B-positive,O,I-positive,I-negative,I-neutral,B-neutral
goreng,0.0,0.000154,0.0,0.000235,0.0,0.0,0.0
jäger,0.000292,0.0,0.0,0.0,0.0,0.0,0.0
Mane,0.0,0.0,0.0,0.000235,0.0,0.0,0.0
Tigre,0.0,0.0,0.0,0.0,0.0,6e-05,0.0
#UNK#,0.0,0.0,4e-06,0.0,0.0,0.0,0.0


In [4]:
def training_dataset(file):
    """
    :param file: input preprocessed dataset 
    :return: new emissions params given preprocessed data and k
    """
    dataset = obtain_data(file)
    k = 1
    return get_new_emission_params(dataset,k)

#single sentiment analysis for a word
def sentiment_analysis(emission_param,x):
    """
    :param emission_param: new emissions params dataframe,
    :param x: word that you want to predict the sentiment for 
    :return: predicted sentiment
    """
    #checking if the tweet is an undiscovered/discovered word
    #if the word does not appear in training set, then change it to #UNK#
    #print(emission_param.index) #gives the individual tweets
    if x not in emission_param.index:
        x = '#UNK#'
    probability = emission_param.loc[x,:]
    max_probability = None
    for col in probability.index:
        #print(col) #gives the sentiments labels
        '''B-positive
            ..
            ...
            -
            I-negative
            B-neutral
            242
            O
            477
            B-negative
            .
            I-positive
            I-neutral'''
        if max_probability is None : 
            max_probability = probability.loc[col]
            y = col
        elif probability.loc[col]>max_probability: #take the max prob
            max_probability = probability.loc[col]
            y = col #take the sentiment with the highest probability
        
    return y
        

def evaluation(filename,emission_param,outputfile):
    """
    :param filename: input datafile
    :param emission_param: emission param dataframe 
    """
    with open(filename,'r',encoding="utf8") as inputfile:
        lines = inputfile.readlines()
        lines = [line.replace('\n','') for line in lines]
        #print(lines)
        '''['best', 'friends', 'who', 'cry', 'on', 'FaceTime', 'together', ',', 'stay', 'together', '', "I'm", 'at', 'Starbucks',
        'in', 'Johor', 'Bahru', ',', 'Johor', 'w', '/', '@cassiecr17', 'https://t.co/3rzoTtjRag', '', 'Reports', 'of', 'a', 
        'collision', 'on', 'Friary', 'Road', 'in', 'Naas', 'https://t.co/MZgfLNdbyr', '', '♫', 'She', 'Moves', 'In', 'Her', 'Own' ......]
        '''
        
        for i in range(len(lines)):
            line = lines[i] #each individual tweets
            if line != '': #if line is not empty
                line = line + ' ' + sentiment_analysis(emission_param,line)
            line += '\n'
            lines[i] = line
            
        with open(outputfile,"w",encoding="utf8") as outputfile:
            for line in lines:
                outputfile.write(line)
    print("evaluation completed!")
            
        
        

In [11]:
modified_em_params = get_new_emission_params(obtain_data('SG\train'),1)
word = 'Experimental'
try:
    print(modified_em_params.loc[word]) #displays the probability of the sentiments tagged
except:
    word = '#UNK#' #if the tweet is replaced with #UNK#
    print(modified_em_params.loc[word]) #displays the probability of the sentiments tagged
sentiment_analysis(modified_em_params,word)

I-positive
B-negative    0.000000
B-positive    0.000000
O             0.000000
I-positive    0.000235
I-negative    0.000000
I-neutral     0.000000
B-neutral     0.000000
Name: #UNK#, dtype: float64


'I-positive'

In [None]:
#doing it for all 4 countries
for cty in ["CN","EN","FR","SG"]:
    emission_params = training_dataset(cty+"/train") #new emission params are also obtained inside here
    print("Analysing for " + cty + " dataset:")
    evaluation(cty+"/dev.in",emission_params,cty+"/dev.p2.out")


##### >python3 evalResult.py FR/dev.out FR/dev.p2.out
Entity in gold data: 238
Entity in prediction: 1111

Correct Entity : 180
Entity  precision: 0.1620
Entity  recall: 0.7563
Entity  F: 0.2669

Correct Entity Type : 75
Entity Type  precision: 0.0675
Entity Type  recall: 0.3151
Entity Type  F: 0.1112

##### >python3 evalResult.py EN/dev.out EN/dev.p2.out
Entity in gold data: 802
Entity in prediction: 1126

Correct Entity : 624
Entity  precision: 0.5542
Entity  recall: 0.7781
Entity  F: 0.6473

Correct Entity Type : 508
Entity Type  precision: 0.4512
Entity Type  recall: 0.6334
Entity Type  F: 0.5270

#### >python3 evalResult.py CN/dev.out CN/dev.p2.out
Entity in gold data: 1081
Entity in prediction: 5001

Correct Entity : 595
Entity  precision: 0.1190
Entity  recall: 0.5504
Entity  F: 0.1957

Correct Entity Type : 373
Entity Type  precision: 0.0746
Entity Type  recall: 0.3451
Entity Type  F: 0.1227

#### >python3 evalResult.py SG/dev.out SG/dev.p2.out
Entity in gold data: 4092
Entity in prediction: 12062

Correct Entity : 2398
Entity  precision: 0.1988
Entity  recall: 0.5860
Entity  F: 0.2969

Correct Entity Type : 1295
Entity Type  precision: 0.1074
Entity Type  recall: 0.3165
Entity Type  F: 0.1603