# Using the Perceptron for NLP

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Classifies one example with the perceptron decision rule
# Arguments are:
# f - the input features
# w - the model weights (multiplied elementwise with f)
# b - the model bias
# Returns 1 when the activation is non-negative, otherwise -1
def predict(f, w, b):
    # Activation: a = b + w1*f1 + w2*f2 + w3*f3 + ...
    activation = sum(w * f) + b

    # Threshold at zero; an activation of exactly 0 counts as 1
    return 1 if activation >= 0 else -1

# Measures how well a fixed model does on a testing dataframe
# Runs train_one_epoch with update=False, so no perceptron updates
# are applied; the accuracy (a percentage) is printed and returned
# Set show_errors=True to also print every misclassified token
def evaluate(test_df, weight, bias, show_errors=False):
    accuracy = train_one_epoch(
        test_df,
        weight,
        bias,
        update=False,
        show_errors=show_errors,
    )
    return accuracy

# Runs one pass (epoch) of perceptron training over a dataframe
# Arguments are:
# train_df    - dataframe with one example per row; it should have columns:
#               - "x" (the token; only used when printing errors)
#               - "y" (the label, which takes values -1 and 1)
#               - other columns with 0/1 or False/True values, interpreted as features
# weight      - the model weights, one per feature column
#               (should be initialized to 0 before the first epoch)
# bias        - the model bias (should be initialized to 0 before the first epoch)
# update      - when True, apply the perceptron update after every mistake;
#               when False, only measure the accuracy
# show_errors - when True, print the token of every misclassified row
# Returns:
# - (weight, bias, accuracy) when update is True
# - accuracy alone when update is False (it is also printed)
# accuracy is the percentage of rows predicted correctly, rounded to 2 places
# When update is True it counts the mistakes made *during* the pass,
# i.e. before the model was corrected for that row
# Raises ValueError if train_df has no rows (the accuracy would be undefined)
# NOTE: weight is updated with +=, so a NumPy array passed in is modified in place
def train_one_epoch(train_df, weight, bias, update=True, show_errors=False):
    # Use the size of the dataframe that was passed in, not a global one
    n_rows = len(train_df)
    if n_rows == 0:
        raise ValueError("train_df must contain at least one row")

    incorrect = 0

    for _, row in train_df.iterrows():

        # Split the row into the label, the token and the feature vector
        y = row["y"]
        token = row["x"]
        f = np.array(row.drop(["y", "x"]).tolist())

        # Only a wrong prediction changes anything:
        # count it, and (when training) nudge the model toward y
        if predict(f, weight, bias) != y:
            if show_errors:
                print(f"Incorrect prediction for {token}")
            incorrect += 1
            if update:
                weight += y*f
                bias += y

    accuracy = np.round(100 - incorrect/n_rows*100, 2)

    if update:
        return weight, bias, accuracy
    else:
        print(f"Accuracy = {accuracy}%")
        return accuracy

In [None]:
# Load the "Pride & Prejudice Nouns" dataset:
# all of the nouns in Pride and Prejudice, as identified by spaCy,
# along with the previous 3 words.
# Text taken from Project Gutenberg (so it includes the header and footer info)
df = pd.read_csv(
    "https://raw.githubusercontent.com/Greg-Hallenbeck/HARP-210-NLP/main/datasets/PnP-nouns.csv"
)

# Pre-processing (safe to ignore): the file splits the preceding words
# across two columns, so join "prevN" and "prev" with a space into "prev"
# and then discard "prevN"
df["prev"] = df["prevN"] + " " + df["prev"]
df.drop(columns=["prevN"], inplace=True)
df.head(5)

Unnamed: 0,x,prev,y
0,judge,great uncle the,-1
1,father,did conquer her,-1
2,quest,the lane in,-1
3,book,conscientiously to his,-1
4,country,ladies in the,-1


In [None]:
# Create features
# I'll add more in the lecture
# Each entry maps a new True/False feature column to the regex that defines it

# Features computed from the noun itself (column "x")
_noun_patterns = {
    "-s": r".*s$",               # does it end in s
    "-ss": r".*ss$",             # does it end in ss
    "people": r"^people$",       # is it the word people, which is plural
    "children": r"^children$",   # is it the word children
    "news": r"^news$",           # is it the word news
    "-i": r".*i$",               # does it end in i
    "-men": r".*men$",           # does it end in men
}

# Features computed from the preceding words (column "prev")
_context_patterns = {
    # is "the" one of the preceding words
    "the": r".*\b(the)\b.*",
    # preceded by one of these words: the noun is expected to be singular
    "pv.sg": r".*\b(a|one|this|that)\b.*",
    # preceded by one of these words: the noun is expected to be plural
    "pv.pl": r".*\b(some|many|these|those|two|three|four)\b.*",
}

for _feature, _regex in _noun_patterns.items():
    df[_feature] = df["x"].str.match(_regex)

for _feature, _regex in _context_patterns.items():
    df[_feature] = df["prev"].str.match(_regex)

In [None]:
# Preview the first rows again, now with the feature columns added
df.head()

Unnamed: 0,x,prev,y,-s,-ss,people,the,pv.sg,pv.pl
0,judge,great uncle the,-1,False,False,False,True,False,False
1,father,did conquer her,-1,False,False,False,False,False,False
2,quest,the lane in,-1,False,False,False,True,False,False
3,book,conscientiously to his,-1,False,False,False,False,False,False
4,country,ladies in the,-1,False,False,False,True,False,False


In [None]:
# Remove the "prev" text column now that its features have been extracted
# train_one_epoch treats every column other than "x" and "y" as a feature,
# so a leftover text column would end up in the feature vector
df.drop(columns=["prev"], inplace=True)

In [None]:
# Start from an all-zero model
# Every column except "x" and "y" is a feature, hence the -2
n_features = df.shape[1] - 2
# Integer weights, one per feature
w = np.zeros(n_features, dtype=int)
b = 0

In [None]:
# Train for a single epoch (one pass over every row of df)
# w and b come back updated; a is the accuracy (in percent) measured during the pass
w, b, a = train_one_epoch(df, weight=w, bias=b)

In [None]:
# What are our new weights and bias?
# How do we interpret them?
# columns[2:] skips the two non-feature columns ("x" and "y"), leaving the
# feature names in the same order as the weights
print(df.columns[2:])
# A positive weight pushes the prediction toward 1 (plural), a negative weight
# toward -1 (singular). E.g. a weight of 3 for "-s" is a strong indicator that
# the word is plural, while a clearly negative weight for "-ss" is a strong
# indicator that it is singular.
print(w, b)
# The sizes are relative to each other: if one weight were 3 and the other
# weights were in the millions, then 3 wouldn't be a strong indicator at all.

# A weight of 0 means the feature doesn't really affect the prediction

Index(['-s', '-ss', 'people', 'children', 'news', '-i', '-men', 'the', 'pv.sg',
       'pv.pl'],
      dtype='object')
[ 3 -4  3  2 -4  0  3  0  0  0] -1


In [None]:
# Evaluate the model, without updating it
# All weights stay fixed for the entire pass over the dataset
# Turn on show_errors to print out every word the model got wrong
# NOTE: this scores the model on the same dataframe it was trained on
evaluate(df, w, b, show_errors=True)

Incorrect prediction for press:--charles
Incorrect prediction for upstairs
Incorrect prediction for s
Incorrect prediction for adieus
Incorrect prediction for teeth
Incorrect prediction for corps
Incorrect prediction for corps
Incorrect prediction for mis
Incorrect prediction for genius
Incorrect prediction for press:--charles
Incorrect prediction for genius
Incorrect prediction for implied
Incorrect prediction for means
Incorrect prediction for stanzas
Incorrect prediction for means
Incorrect prediction for condemn
Incorrect prediction for virus
Incorrect prediction for corps
Incorrect prediction for data
Incorrect prediction for status
Incorrect prediction for means
Incorrect prediction for us;--was
Incorrect prediction for m^{rs
Incorrect prediction for mother.--yours
Incorrect prediction for means
Incorrect prediction for genius
Incorrect prediction for frost
Incorrect prediction for hitherto
Incorrect prediction for feet
Incorrect prediction for corps
Incorrect prediction for adie

99.7