In [945]:
#import dependencies

import nltk
import random
import numpy as np
nltk.download()
from nltk.tokenize import sent_tokenize, word_tokenize, casual_tokenize, PunktSentenceTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd




showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [946]:
stop_words = set(stopwords.words("english"))
stop_words


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [947]:
#import training data CSV

train_df = pd.read_csv("nlp-getting-started/train - hand cleaned.csv")

In [948]:
example_text = "Theres a fire in the Catalinas. Looks kinda cool. This picture doesnt do it justice. https://t.co/N0tAwGeZJx"

print(sent_tokenize(example_text))

['Theres a fire in the Catalinas.', 'Looks kinda cool.', 'This picture doesnt do it justice.', 'https://t.co/N0tAwGeZJx']


In [949]:
print(word_tokenize(example_text))

['Theres', 'a', 'fire', 'in', 'the', 'Catalinas', '.', 'Looks', 'kinda', 'cool', '.', 'This', 'picture', 'doesnt', 'do', 'it', 'justice', '.', 'https', ':', '//t.co/N0tAwGeZJx']


In [950]:
filtered_text = []
for element in word_tokenize(example_text):
    if element not in stop_words:
        print(element)

Theres
fire
Catalinas
.
Looks
kinda
cool
.
This
picture
doesnt
justice
.
https
:
//t.co/N0tAwGeZJx


In [951]:
tweet_text = train_df['text']
tweet_text

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to shelter in place are be...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [952]:
# word_tokenize(tweet_text)

In [953]:
#Use Pandas and RegEx to copy hashtags, mentions, and urls and store them each in a list in each their separate columns
train_df["hashtags"] = train_df['text'].str.findall(r'([#]\S+)')
train_df["mentions"] = train_df['text'].str.findall(r'([@]\S+)')
train_df["urls"] = train_df['text'].str.findall(r'(http:\/\/\S+)')

#Effectively deletes mentions and urls with Pandas and Regex by replacing them with an empty string as they are probably not particularly useful in NLP for Tweets
train_df["text"] = train_df["text"].str.replace('([@]\S+)', '', regex=True) #deletes mentions
train_df["text"] = train_df["text"].str.replace('((https?:\/\/)\S+)', '', regex=True) #deletes URLs

#Replaces the # symbol with an empty space to turn hashtags into regular words
train_df["text"] = train_df["text"].str.replace('[#:\n]', '', regex=True) #deletes the hashtage symbol only

#Replaces non ASCII words and symbols like: Â‰Ã›Ã with blank spaces
train_df["text"] = train_df["text"].str.replace('[^\x00-\x7F]+', '', regex=True) #deletes non English alphabet characters



In [954]:
tweet_text

0       Our Deeds are the Reason of this earthquake Ma...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to shelter in place are be...
3       13,000 people receive wildfires evacuation ord...
4       Just got sent this photo from Ruby Alaska as s...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609      The out of control wild fires in California ...
7610           M1.94 [0104 UTC]?5km S of Volcano Hawaii. 
7611    Police investigating after an e-bike collided ...
7612    The Latest More Homes Razed by Northern Califo...
Name: text, Length: 7613, dtype: object

In [955]:
train_df.head(50)

Unnamed: 0,id,keyword,location,text,target,hashtags,mentions,urls
0,1,,,Our Deeds are the Reason of this earthquake Ma...,1,[#earthquake],[],[]
1,4,,,Forest fire near La Ronge Sask. Canada,1,[],[],[]
2,5,,,All residents asked to shelter in place are be...,1,[],[],[]
3,6,,,"13,000 people receive wildfires evacuation ord...",1,[#wildfires],[],[]
4,7,,,Just got sent this photo from Ruby Alaska as s...,1,"[#Alaska, #wildfires]",[],[]
5,8,,,RockyFire Update => California Hwy. 20 closed ...,1,"[#RockyFire, #CAfire, #wildfires]",[],[]
6,10,,,flood disaster Heavy rain causes flash floodin...,1,"[#flood, #disaster]",[],[]
7,13,,,Im on top of the hill and I can see a fire in ...,1,[],[],[]
8,14,,,Theres an emergency evacuation happening now i...,1,[],[],[]
9,15,,,Im afraid that the tornado is coming to our ar...,1,[],[],[]


In [956]:
#reference
train_df.iloc[100,:]

id                                                        144
keyword                                              accident
location                                                   UK
text        . Bahrain police had previously died in a road...
target                                                      1
hashtags                                           [#Bahrain]
mentions                                         [@NorwayMFA]
urls                                                       []
Name: 100, dtype: object

In [957]:
test_tweet_46 = word_tokenize(tweet_text[46])

In [958]:
for element in word_tokenize(train_df['text'][46]):
    print(nltk.ne_chunk(nltk.pos_tag([element])))
    # print(nltk.ne_chunk([element]))

(S How/WRB)
(S the/DT)
(S (GPE West/NNP))
(S was/VBD)
(S burned/VBN)
(S Thousands/NNS)
(S of/IN)
(S wildfires/NNS)
(S ablaze/NN)
(S in/IN)
(S (GPE California/NNP))
(S alone/RB)


In [959]:
train_df.iloc[46,:]

id                                                         66
keyword                                                ablaze
location                            GREENSBORO,NORTH CAROLINA
text        How the West was burned Thousands of wildfires...
target                                                      1
hashtags                                                   []
mentions                                                   []
urls                                 [http://t.co/vl5TBR3wbr]
Name: 46, dtype: object

In [960]:
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("worse", pos = "a"))


bad


In [961]:
#turns the text and the target from the df into a list of tuples
documents = []
for index, row in train_df.iterrows():
    # print(word_tokenize(row['text']), row['target'])
    documents.append((word_tokenize(row['text']), row['target']))

print(documents)




In [962]:
word_tokenize(tweet_text[46])

['How',
 'the',
 'West',
 'was',
 'burned',
 'Thousands',
 'of',
 'wildfires',
 'ablaze',
 'in',
 'California',
 'alone']

In [963]:
all_words = []    #p.11
for tweet in train_df["text"]:
    tweet = tweet.split()
    for word in tweet:
        all_words.append(word.lower())


In [964]:
all_words = nltk.FreqDist(all_words)
all_words

FreqDist({'the': 3233, 'a': 2139, 'in': 1949, 'to': 1933, 'of': 1813, 'and': 1400, 'i': 1348, 'is': 933, 'for': 882, 'on': 837, ...})

In [973]:
word_features = list(all_words)[:500] #p.12

In [966]:
random.shuffle(documents)
def find_features(documents):
    words = set(documents)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    
    return features

# print((find_features(test_tweet_46)))

In [967]:
# featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [968]:
from nltk.corpus import movie_reviews
nltk.download('movie_reviews')

# print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
print(movie_reviews.words('neg/cv000_29416.txt'))

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Goutham\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [974]:
# print(movie_reviews.words('neg/cv000_29416.txt'))
# print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
# find_features(['a','the'])****

find_features(test_tweet_46)



{'the': True,
 'a': False,
 'in': True,
 'to': False,
 'of': True,
 'and': False,
 'i': False,
 'is': False,
 'for': False,
 'on': False,
 '-': False,
 'you': False,
 'my': False,
 'with': False,
 'that': False,
 'at': False,
 'by': False,
 'it': False,
 'this': False,
 'from': False,
 'are': False,
 'be': False,
 'have': False,
 'was': True,
 'like': False,
 'as': False,
 'just': False,
 'so': False,
 'but': False,
 'im': False,
 'up': False,
 '&amp;': False,
 'your': False,
 'not': False,
 'me': False,
 'its': False,
 'after': False,
 'will': False,
 'when': False,
 'has': False,
 'an': False,
 'all': False,
 'out': False,
 'if': False,
 'no': False,
 'we': False,
 '??': False,
 'get': False,
 'fire': False,
 'new': False,
 'via': False,
 'about': False,
 'dont': False,
 'more': False,
 'or': False,
 'they': False,
 'been': False,
 'what': False,
 'he': False,
 'how': False,
 'now': False,
 'over': False,
 'one': False,
 'people': False,
 'who': False,
 'into': False,
 'news': False,

In [970]:
test_tweet_46

['How',
 'the',
 'West',
 'was',
 'burned',
 'Thousands',
 'of',
 'wildfires',
 'ablaze',
 'in',
 'California',
 'alone']

In [971]:
# list(all_words)[:3000]
test_tweet_46

['How',
 'the',
 'West',
 'was',
 'burned',
 'Thousands',
 'of',
 'wildfires',
 'ablaze',
 'in',
 'California',
 'alone']

In [972]:
# all_words.keys()
list(all_words)[:30] 

['the',
 'a',
 'in',
 'to',
 'of',
 'and',
 'i',
 'is',
 'for',
 'on',
 '-',
 'you',
 'my',
 'with',
 'that',
 'at',
 'by',
 'it',
 'this',
 'from',
 'are',
 'be',
 'have',
 'was',
 'like',
 'as',
 'just',
 'so',
 'but',
 'im']