# Python Basic

### File Operations

Read file line by line

In [50]:
lines = []
for line in open('building_global_community.txt'):
    # delete the blank and line feed at the begining and end
    line = line.strip()
    # add processed line text into list 'lines'
    lines.append(line)

In [51]:
print (lines[0])

To our community,


or you can just write

In [10]:
# list comprehension
lines = [line.strip() for line in open('building_global_community.txt')]

In [11]:
lines[0]

'To our community,'

### String operations

In [12]:
sentence = "I want to eat an apple ."

#### string indexing

In [6]:
sentence[5]

't'

In [7]:
sentence[10:13]

'eat'

In [8]:
sentence[-1]

'.'

In [9]:
sentence[10:-3]

'eat an appl'

#### find sequences in string

In [13]:
sentence.find('a')

3

In [14]:
sentence.find('aaa')  ＃如果要找的值不再字串裡，就會回傳 -1

SyntaxError: invalid character in identifier (<ipython-input-14-d72709917bae>, line 1)

find from right-hand side

In [15]:
sentence.rfind('a')

17

find with a starting point

In [16]:
sentence.find('a', 4)

11

return -1 when not found

In [17]:
sentence.find('can')

-1

combine the use of subsequence and find

In [15]:
sentence[sentence.find('want to'):sentence.rfind('.')]

'want to eat an apple '

### String Normalization

In [16]:
sentence

'I want to eat an apple .'

In [17]:
sentence.lower()

'i want to eat an apple .'

In [18]:
sentence.upper()

'I WANT TO EAT AN APPLE .'

In [19]:
sentence.capitalize()

'I want to eat an apple .'

In [20]:
'A'.isupper()

True

In [21]:
'A'.islower()

False

In [22]:
'apple'.isalpha()

True

In [23]:
'20'.isdigit()

True

In [24]:
'20.9'.isdigit()

False

In [25]:
'20'.isdecimal()

True

In [26]:
'furen5566'.isalnum()

True

### split sentence by blank

In [27]:
# the result is list of words in the sentence
sentence.split(' ')

['I', 'want', 'to', 'eat', 'an', 'apple', '.']

In [28]:
sentence.endswith('.')

True

In [29]:
sentence.startswith('He wants')

False

## Dictionary examples

In [30]:
# book = dict()
book = {}

In [31]:
# we can assign any value to any key
book['title'] = 'Natural Language Processing with Python'
book['author'] = 'Bird, Klein, and Loper'
book['year'] = 2009

In [32]:
book

{'author': 'Bird, Klein, and Loper',
 'title': 'Natural Language Processing with Python',
 'year': 2009}

In [33]:
book.keys()

dict_keys(['year', 'author', 'title'])

In [34]:
book.values()

dict_values([2009, 'Bird, Klein, and Loper', 'Natural Language Processing with Python'])

In [35]:
book.items()

dict_items([('year', 2009), ('author', 'Bird, Klein, and Loper'), ('title', 'Natural Language Processing with Python')])

string formatting

In [36]:
'%s is a book written by %s in %d' % (book['title'], book['author'], book['year'])

'Natural Language Processing with Python is a book written by Bird, Klein, and Loper in 2009'

In [37]:
'{0} is a book written by {1} in {2}'.format(book['title'], book['author'], book['year'])

'Natural Language Processing with Python is a book written by Bird, Klein, and Loper in 2009'

In [38]:
# advanced formatting
'{title} is a book written by {author} in {year}'.format(**book)

'Natural Language Processing with Python is a book written by Bird, Klein, and Loper in 2009'

## Counting Example

In [18]:
data = ['red', 'red', 'red', 'red', 'yellow', 'yellow', 'yellow', 'blue', 'blue']

In [19]:
counter = dict()
for color in data:
    if color in counter:
        counter[color] += 1
    else:
        counter[color] = 1

In [20]:
counter

{'blue': 2, 'red': 4, 'yellow': 3}

### use default dictionary

In [21]:
from collections import defaultdict
counter = defaultdict(lambda: 0)  # default value function is 0
counter = defaultdict(int)  # default value function is "int", which initialize to 0

In [22]:
for color in data:
    counter[color] += 1

In [23]:
counter

defaultdict(int, {'blue': 2, 'red': 4, 'yellow': 3})

### use built-in Counter

In [24]:
from collections import Counter

In [25]:
counter = Counter(data)

In [26]:
counter

Counter({'blue': 2, 'red': 4, 'yellow': 3})

In [27]:
new_data = ['blue', 'red', 'blue', 'yellow', 'blue', 'yellow', 'blue', 'yellow', 'blue']
counter.update(new_data)

In [28]:
counter

Counter({'blue': 7, 'red': 5, 'yellow': 6})

#### most common elements

In [29]:
counter.most_common()

[('blue', 7), ('yellow', 6), ('red', 5)]

In [30]:
counter.most_common(2)

[('blue', 7), ('yellow', 6)]

In [31]:
for color, count in counter.most_common():
    print('{0}: {1}'.format(color, count))

blue: 7
yellow: 6
red: 5


In [41]:
# clear counter
counter.clear()
print(counter['blue'])

0


# Exercise

compute the word frequencies in "Building_Global_Community.txt"
- read sentences from file "Building_Global_Community.txt"
- split sentences into words (split, or nltk word_tokenize)
- filter out symbols (isalpha, isdigit, isalnum)
- normalize words and count ('Word' and 'word' are considered as the same word)
- count the occurance of words (counting exmaple)

write your code here

In [27]:
sentences = []
position = 0
for line in open('building_global_community.txt'):
    # 刪減前後的空白(與換行)
    line = line.strip('')
    #print (line)
    
    # 將處理好的字串加入 sentences 
    sentences.append(line)

    #split sentences into words (split)
    sentence = ''.join(sentences)
    
sentence = sentence.replace('-\n','')
sentence = sentence.replace('-',' ')
print (sentence)

To our community,

On our journey to connect the world, we often discuss products we're building and updates on our business. Today I want to focus on the most important question of all: are we building the world we all want?
History is the story of how we've learned to come together in ever greater numbers    from tribes to cities to nations. At each step, we built social infrastructure like communities, media and governments to empower us to achieve things we couldn't on our own.
Today we are close to taking our next step. Our greatest opportunities are now global    like spreading prosperity and freedom, promoting peace and understanding, lifting people out of poverty, and accelerating science. Our greatest challenges also need global responses    like ending terrorism, fighting climate change, and preventing pandemics. Progress now requires humanity coming together not just as cities or nations, but also as a global community. 
This is especially important right now. Facebook stand

In [28]:
# build stopwords
from nltk import word_tokenize
print(word_tokenize(sentence))



In [29]:
from nltk import wordpunct_tokenize
print(wordpunct_tokenize(sentence))



In [30]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Chiali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [1]:
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))
import string
# add more
stops.update(string.ascii_letters + string.punctuation + string.digits)  
#如果我們
stops.update(('--'))

In [2]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [3]:
# you can add one at a time
for symbol in string.punctuation:
    stops.add(symbol)
# or update a sequence into the set
stops.update(string.punctuation)

In [4]:
words = word_tokenize(sentence)
# for word in words:
#     if word not in stopwords:
#         print(word)
print([word for word in words if word not in stops])

NameError: name 'word_tokenize' is not defined

In [35]:
words = [word for word in words if word not in stops]
clean = [word for word in words if word.isalpha()]

In [36]:
from collections import Counter
counter = Counter(clean)


In [37]:
counter.most_common(20)

[('community', 72),
 ('people', 61),
 ('us', 38),
 ('world', 36),
 ('social', 29),
 ('communities', 26),
 ('help', 26),
 ('We', 25),
 ('infrastructure', 24),
 ('Facebook', 24),
 ('global', 24),
 ('groups', 23),
 ('content', 23),
 ('many', 21),
 ('share', 20),
 ('like', 19),
 ('important', 19),
 ('around', 18),
 ('together', 18),
 ('building', 17)]

In [38]:
counter.clear()

### Save the result into a csv file

https://docs.python.org/3/library/csv.html

In [85]:
import csv

write word count result

In [86]:
with open('wordcount.csv', 'w') as csvfile:
    # set up header
    fieldnames = ['word', 'count']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for word, count in counter.most_common():
        writer.writerow({'word': word, 'count': count})

read csv

In [87]:
with open('wordcount.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        print(row['word'], row['count'])

community 80
people 62
us 38
world 36
social 31
help 26
communities 25
facebook 24
infrastructure 24
global 24
groups 23
content 23
many 22
share 20
building 19
important 19
like 19
together 18
around 18
new 16
build 16
friends 15
want 14
even 14
local 13
must 13
across 13
news 12
safe 12
need 12
issues 11
work 11
standards 11
see 11
media 10
today 10
common 10
connect 10
tools 10
come 10
one 10
personal 10
built 10
also 9
part 9
well 9
hope 9
may 8
opportunity 8
example 8
always 8
understanding 8
whether 8
every 8
time 8
civic 8
different 8
greatest 8
information 8
society 7
take 7
safety 7
make 7
two 7
engagement 7
positive 7
past 7
collective 7
system 7
perspectives 7
cultural 7
group 7
years 7
online 7
support 7
find 6
informed 6
often 6
sensationalism 6
give 6
experience 6
values 6
worldwide 6
reading 6
representatives 6
could 6
year 6
strengthen 6
existing 6
reflect 6
way 6
norms 6
physical 6
meaningful 6
seen 6
connecting 6
ideas 6
impact 6
largest 6
taking 6
harm 6
use 5
cities