In [None]:
# when should we use a list or a dictionary?


In [1]:
# so: how many tweets written by Donald Trump (and only by him!) contain the word "great"?

import json

# we state the encoding explicitly, so that reading the file does not depend on
# the default encoding of the machine the notebook runs on
with open('../datasets/trump.json', encoding='utf-8') as f:
    tweets = json.load(f)

print(len(tweets))

written_by_trump = 0
with_great = 0
for tweet in tweets:
    # the notebook treats tweets whose "source" mentions Android as the ones
    # written by Trump himself
    if "Android" in tweet["source"]:
        text = tweet["text"]
        written_by_trump += 1
        # NOTE: this is a substring check on the lower-cased text, so words that
        # merely contain "great" (e.g. "greatest", "greatly") are counted as well
        if "great" in text.lower():
            with_great += 1

# we guard both ratios, so an empty dataset (or one without any Android tweet)
# prints 0.0 instead of stopping with a ZeroDivisionError
share_of_all_tweets = with_great / len(tweets) if tweets else 0.0
share_of_trump_tweets = with_great / written_by_trump if written_by_trump else 0.0

print(with_great, share_of_all_tweets, share_of_trump_tweets)

35018
2236 0.06385287566394426 0.15372980405637676


In [13]:
# we need a few libraries
#os is needed to loop over files in a folder
# codecs is for encoding a file
#BeautifulSoup is needed for parsing the html of a scraped page

import codecs,os
from bs4 import BeautifulSoup

# the folder we read the scraped pages from, and the folder we write the cleaned texts to
input_dir = "../datasets/Articles/"
output_dir = "../datasets/CleanedArticles/"

# everything up to (and including) this marker is the internet archive toolbar
toolbar_end = "<!-- END WAYBACK TOOLBAR INSERT -->"

# we create the "CleanedArticles" folder if it's not there yet
os.makedirs(output_dir, exist_ok=True)

# we prepare an empty list
articles = []

# we loop over a folder
for filename in os.listdir(input_dir):
    # we only keep files whose name really ends with ".txt"
    # (a plain `".txt" in filename` would also accept e.g. "page.txt.bak")
    if not filename.endswith(".txt"):
        continue
    print(filename)

    # we open and read the file; "with" closes it for us even if something fails,
    # and the explicit encoding matches the one we use when writing the output
    with open(os.path.join(input_dir, filename), "r", encoding="utf-8") as doc:
        raw_html = doc.read()

    # look at the original HTML - we remove the internet archive toolbar;
    # if a page has no toolbar marker we keep the whole page instead of crashing
    parts = raw_html.split(toolbar_end)
    page_source = parts[1] if len(parts) > 1 else raw_html

    # we parse the page
    html_page = BeautifulSoup(page_source, "lxml")

    # we define a new list, called text
    text = []

    # simply take all the paragraphs - we search for the elements "p"
    for para in html_page.find_all('p'):
        # we remove breaklines, tabs etc
        para = para.text.replace("\n", " ").replace("\t", " ").replace("\r", " ")
        text.append(para)

    # we put all paragraphs in a single text
    text = " ".join(text)

    # we write the text to a new file with the same name in the output folder;
    # again "with" takes care of closing the file
    with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as output:
        output.write(text)

    # we add the text to a list of articles
    articles.append(text)

15.txt
16.txt
13.txt
12.txt
10.txt
11.txt
9.txt
8.txt
5.txt
4.txt
6.txt
7.txt
3.txt
2.txt
0.txt
1.txt
19.txt
18.txt


In [15]:
# we pick the article stored at position 0 of the list and show its full text
article = articles[0]
print(article)

Advertisement By JACEY FORTINDEC. 31, 2017  In Sydney, rainbow fireworks sparkled off the Harbour Bridge in celebration of Australia’s recent legalization of gay marriage. (Sydney was among the first major cities to celebrate with fireworks at the stroke of midnight.) In Japan, people paraded in fox masks to attend the first prayer of the year at a Shinto shrine in Tokyo. In the Philippines, revelers gathered — phones in hand — at the Eastwood Mall in Manila to watch balloons and confetti rain down at midnight. Big pots of tea were prepared for New Year’s Eve celebrations in Beijing. The country will also celebrate the Lunar New Year, in February. It was raining in Singapore, but New Year’s Eve celebrants sheltered under umbrellas and raincoats as fireworks sparkled overhead. Tourists donned party hats to watch fireworks in front of the famous Petronas Twin Towers in Kuala Lumpur, Malaysia. Hundreds of couples got married at a mass wedding in Jakarta on New Year’s Eve. We’re interested

In [5]:
# we need nltk - one of the most widely used text processing libraries in python

import nltk # --> documentation: http://www.nltk.org/

# you will also need the models used to split a text into sentences and words:
# older NLTK releases load the 'punkt' package, while NLTK 3.9 and later load
# 'punkt_tab' instead - we download both so the next cells work with either
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/federiconanni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
# first step: split the article into sentences
# (documentation for this command: http://www.nltk.org/_modules/nltk/tokenize.html)
sentences = nltk.sent_tokenize(article)

# we print every sentence followed by a line holding a single space,
# so we can examine the quality of the output
for sentence in sentences:
    print(sentence, " ", sep="\n")


Advertisement By JACEY FORTINDEC.
 
31, 2017  In Sydney, rainbow fireworks sparkled off the Harbour Bridge in celebration of Australia’s recent legalization of gay marriage.
 
(Sydney was among the first major cities to celebrate with fireworks at the stroke of midnight.)
 
In Japan, people paraded in fox masks to attend the first prayer of the year at a Shinto shrine in Tokyo.
 
In the Philippines, revelers gathered — phones in hand — at the Eastwood Mall in Manila to watch balloons and confetti rain down at midnight.
 
Big pots of tea were prepared for New Year’s Eve celebrations in Beijing.
 
The country will also celebrate the Lunar New Year, in February.
 
It was raining in Singapore, but New Year’s Eve celebrants sheltered under umbrellas and raincoats as fireworks sparkled overhead.
 
Tourists donned party hats to watch fireworks in front of the famous Petronas Twin Towers in Kuala Lumpur, Malaysia.
 
Hundreds of couples got married at a mass wedding in Jakarta on New Year’s Eve

In [17]:
# how do we look at one sentence only? we pick it from the list by its position
# (position 1 is the second sentence, because counting starts at 0)

sentence = sentences[1]
print(sentence)

31, 2017  In Sydney, rainbow fireworks sparkled off the Harbour Bridge in celebration of Australia’s recent legalization of gay marriage.


In [18]:
# we split the sentence into tokens (aka single words and punctuation marks)
tokenized_sentence = nltk.word_tokenize(sentence)

print(tokenized_sentence)

['31', ',', '2017', 'In', 'Sydney', ',', 'rainbow', 'fireworks', 'sparkled', 'off', 'the', 'Harbour', 'Bridge', 'in', 'celebration', 'of', 'Australia’s', 'recent', 'legalization', 'of', 'gay', 'marriage', '.']


In [19]:
# we lower-case every token of the sentence by applying str.lower to each of them
without_capital_letters = list(map(str.lower, tokenized_sentence))

print(without_capital_letters)

# homework: write a for-loop for doing the same thing

['31', ',', '2017', 'in', 'sydney', ',', 'rainbow', 'fireworks', 'sparkled', 'off', 'the', 'harbour', 'bridge', 'in', 'celebration', 'of', 'australia’s', 'recent', 'legalization', 'of', 'gay', 'marriage', '.']


In [20]:
# removing stopwords - step 1: getting the list of stopwords

# homework: download stopwords <- google it out

from nltk.corpus import stopwords

# we ask nltk for its English stopwords
stop = stopwords.words('english')

# what is "stop" ? let's print it and find out

print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [None]:
# "stop" is a list, so every `word not in stop` check would scan it from the start;
# we build a set once, because looking a word up in a set does not require a scan
stop_set = set(stop)

# we keep only the (lower-cased) tokens that are not stopwords
without_stop_words = [word for word in without_capital_letters if word not in stop_set]

print(without_stop_words)

In [None]:
# homework: how do we exclude punctuation? and numbers?
