# challenge 1

In [1]:
import nltk
from nltk import word_tokenize

In [2]:
text = "Ironhack is a Global Tech School ranked num 2 worldwide. Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do. This ideology is reflected in our teaching practices, which consist of a nine-weeks immersive programming, UX/UI design or Data Analytics course as well as a one-week hiring fair aimed at helping our students change their career and get a job straight after the course. We are present in 8 countries and have campuses in 9 locations - Madrid, Barcelona, Miami, Paris, Mexico City,  Berlin, Amsterdam, Sao Paulo and Lisbon."

In [3]:
print(word_tokenize(text))

['Ironhack', 'is', 'a', 'Global', 'Tech', 'School', 'ranked', 'num', '2', 'worldwide', '.', 'Our', 'mission', 'is', 'to', 'help', 'people', 'transform', 'their', 'careers', 'and', 'join', 'a', 'thriving', 'community', 'of', 'tech', 'professionals', 'that', 'love', 'what', 'they', 'do', '.', 'This', 'ideology', 'is', 'reflected', 'in', 'our', 'teaching', 'practices', ',', 'which', 'consist', 'of', 'a', 'nine-weeks', 'immersive', 'programming', ',', 'UX/UI', 'design', 'or', 'Data', 'Analytics', 'course', 'as', 'well', 'as', 'a', 'one-week', 'hiring', 'fair', 'aimed', 'at', 'helping', 'our', 'students', 'change', 'their', 'career', 'and', 'get', 'a', 'job', 'straight', 'after', 'the', 'course', '.', 'We', 'are', 'present', 'in', '8', 'countries', 'and', 'have', 'campuses', 'in', '9', 'locations', '-', 'Madrid', ',', 'Barcelona', ',', 'Miami', ',', 'Paris', ',', 'Mexico', 'City', ',', 'Berlin', ',', 'Amsterdam', ',', 'Sao', 'Paulo', 'and', 'Lisbon', '.']


# challenge 2

In [4]:
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

def clean_up(text):
    text = text.lower()
    text = re.sub('@.+? ', '', text)
    text = re.sub('htt.+? ', '', text)
#     text = re.sub('www.+? ', '', text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[^a-z]+',' ',text).strip()
    return text

In [5]:
def tokenize(string):
    return word_tokenize(string)

def stem_and_lemmatize(str_list):
    stemmer = SnowballStemmer("english")
    lemming = WordNetLemmatizer()
    s_and_l = []
    for word in str_list:
        word = lemming.lemmatize(word, pos='v')
        s_and_l.append(stemmer.stem(word))
    return s_and_l

In [6]:
from nltk.corpus import stopwords

def remove_stopwords(s_and_l):
    bad_list = set(stopwords.words('english'))
    output = []
    for word in s_and_l:
        if word not in bad_list:
            output.append(word)
    return output

# challenge 3

In [7]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import zipfile
#nltk.download('vader_lexicon')

In [8]:
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores(text)

{'compound': 0.9393, 'neg': 0.0, 'neu': 0.828, 'pos': 0.172}

In [9]:
%%time
zf = zipfile.ZipFile('Sentiment140.csv.zip')
df = pd.read_csv(zf.open('Sentiment140.csv'))

Wall time: 4.14 s


In [10]:
df['text_processed'] = df['text']

In [11]:
%%time
df['text_processed'] = df['text_processed'].apply(clean_up)
print(df['text_processed'][:3])

0    awww thats a bummer you shoulda got david carr...
1    is upset that he cant update his facebook by t...
2    i dived many times for the ball managed to sav...
Name: text_processed, dtype: object
Wall time: 31.2 s


In [12]:
%%time
df['text_processed'] = df['text_processed'].apply(tokenize)
print(df['text_processed'][:3])

0    [awww, thats, a, bummer, you, shoulda, got, da...
1    [is, upset, that, he, cant, update, his, faceb...
2    [i, dived, many, times, for, the, ball, manage...
Name: text_processed, dtype: object
Wall time: 2min 17s


In [13]:
%%time
df['text_processed'] = df['text_processed'].apply(remove_stopwords)
print(df['text_processed'][:3])

0    [awww, thats, bummer, shoulda, got, david, car...
1    [upset, cant, update, facebook, texting, might...
2    [dived, many, times, ball, managed, save, rest...
Name: text_processed, dtype: object
Wall time: 5min 11s


In [14]:
%%time
df['text_processed'] = df['text_processed'].apply(stem_and_lemmatize)
print(df['text_processed'][:3])

0    [awww, that, bummer, shoulda, get, david, carr...
1    [upset, cant, updat, facebook, text, might, cr...
2    [dive, mani, time, ball, manag, save, rest, go...
Name: text_processed, dtype: object
Wall time: 2min 42s


In [15]:
df.head()

Unnamed: 0,target,id,date,flag,user,text,text_processed
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[awww, that, bummer, shoulda, get, david, carr..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[upset, cant, updat, facebook, text, might, cr..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[dive, mani, time, ball, manag, save, rest, go..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[whole, bodi, feel, itchi, like, fire]"
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[behav, im, mad, cant, see]"


In [16]:
lst = []
for word in df.text_processed:
    lst += word
len(lst)

11403113

In [35]:
from nltk.probability import FreqDist
cfd = FreqDist(lst)
mostest = cfd.most_common()[:5000]
bow = [x[0] for x in mostest]
len(bow)

5000

In [36]:
def find_features(document):
    features = {}
    for w in bow:
        features[w] = (w in document)
    
    analyzer = SentimentIntensityAnalyzer().polarity_scores(" ".join(document))
    if analyzer["pos"] > 0.2:
        analyzer = True
    else:
        analyzer = False
    
    return (features, analyzer)

In [40]:
n = 10000; b = n/65
aaa = df[:n]
print(f"if each second my laptop can process {round(b)} tweets\nthen it would take {round((1600000/b)/3600,2)} hours for the full 1.6M tweets")

if each second my laptop can process 154 tweets
then it would take 2.89 hours for the full 1.6M tweets


In [38]:
%%time
feature = aaa.text_processed.apply(find_features)

Wall time: 1min 5s


In [42]:
%%time
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.classify import accuracy as acc

training_set = feature[:5000]
testing_set = feature[5000:]

classifier = nbc.train(training_set)
print("Classifier accuracy percent:",(acc(classifier, testing_set))*100)

Classifier accuracy percent: 81.46
Wall time: 1min 13s
