Skip to content
This repository has been archived by the owner on Jan 19, 2022. It is now read-only.

Commit

Permalink
Update step_one.py
Browse files Browse the repository at this point in the history
Fixed POS and Lemmatizer
  • Loading branch information
derekstephen committed Mar 21, 2020
1 parent 38985c6 commit a24189a
Showing 1 changed file with 10 additions and 9 deletions.
19 changes: 10 additions & 9 deletions step_one.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import pandas as pd
import nltk


# PREPARE DATA CODE


Expand All @@ -24,7 +25,7 @@ def prep_text(mission):


# Download stop words (only required on first run)
#nltk.download('stopwords')
nltk.download('stopwords')

# Load Stop Words
stop_words = stopwords.words('english')
Expand All @@ -44,6 +45,7 @@ def prep_text(mission):
# Remove Stop Words
df["WORD"] = df["WORD"].apply(lambda x: [item for item in x if item not in stop_words])


# END PREPARE DATA CODE

# START STEP ONE CODE
Expand All @@ -59,12 +61,12 @@ def get_wordnet_pos(treebank_tag):
return wordnet.NOUN
elif treebank_tag.startswith('R'):
return wordnet.ADV
else:
return ''
else: # Default Option
return wordnet.NOUN


# Download WordNet (only required on first run)
#nltk.download('wordnet')
nltk.download('wordnet')

# Create Snowball (Porter2) Stemmer
stemmer = snowball.SnowballStemmer('english')
Expand All @@ -78,14 +80,13 @@ def get_wordnet_pos(treebank_tag):
# Create WordNet Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

#Flatten POS to one list
# Flatten POS to one list
df["POS"] = df["POS"].apply(lambda column: [y for x in column for y in x])

# Lemmatization of Words
df["LEMMATIZATION"] = df["POS"].apply(lambda x: [wordnet_lemmatizer.lemmatize(pair[0], pos=get_wordnet_pos(pair[1])) if get_wordnet_pos(pair[1]) != '' else wordnet_lemmatizer.lemmatize(pair[0]) for sent in x for pair in sent])
df["LEMMATIZATION"] = df["POS"].apply(lambda x: [wordnet_lemmatizer.lemmatize(pair[0], pos=get_wordnet_pos(pair[1])) for pair in x])


# Lemmatization of Words
#df["LEMMATIZATION"] = df["POS"].apply(lambda x: [wordnet_lemmatizer.lemmatize(pair[0], pos=get_wordnet_pos(pair[1])) for sent in x for pair in sent])

# END STEP ONE CODE

Expand Down

0 comments on commit a24189a

Please sign in to comment.