# 03_02: Loading Text Files

In [4]:
import math
import collections

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

%matplotlib inline

In [5]:
# iterating over an open file yields its lines, one by one;
# the `with` block guarantees the file is closed once the loop is done

words = []
with open('words.txt', 'r') as word_file:
    for line in word_file:
        # each line is kept verbatim, including its trailing newline
        words.append(line)

In [6]:
# one list entry per line of the file, so this is the file's line count
len(words)

235886

In [7]:
# peek at the first ten entries; every one still ends with its newline
words[:10]

['A\n',
 'a\n',
 'aa\n',
 'aal\n',
 'aalii\n',
 'aam\n',
 'Aani\n',
 'aardvark\n',
 'aardwolf\n',
 'Aaron\n']

In [8]:
# strip() with no arguments removes leading and trailing whitespace,
# which takes care of the trailing newline
'Aaron\n'.strip()

'Aaron'

In [9]:
# chaining lower() after strip() also normalizes the capitalization
'Aaron\n'.strip().lower()

'aaron'

In [10]:
# upper() is the uppercase counterpart of lower(); shown here only to try
# the method out -- it is not used in the final version
'Aaron\n'.strip().upper()

'AARON'

In [11]:
# rebuild the list, this time cleaning each line as it is read:
# strip() drops the trailing newline and lower() normalizes the case
words = []
with open('words.txt', 'r') as word_file:  # closed automatically on exit
    for line in word_file:
        words.append(line.strip().lower())

In [12]:
# the same ten entries, now without newlines and all lowercase;
# note that 'a' shows up twice, because 'A' and 'a' lowercase to the same word
words[:10]

['a',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron']

In [13]:
# collect the cleaned words in a set instead of a list:
# a set stores each distinct value only once, so duplicates are discarded
words = set()
with open('words.txt', 'r') as word_file:  # closed automatically on exit
    for line in word_file:
        words.add(line.strip().lower())

In [None]:
# a set comprehension that collects stripped and lowercased lines...
# a set comprehension is like a list comprehension, but uses curly braces
# instead of square brackets; the `with` block closes the file once the
# comprehension has consumed every line
with open('words.txt', 'r') as word_file:
    words = {line.strip().lower() for line in word_file}

In [15]:
# display the set; sets are unordered, so the words come out in arbitrary order
words

{'mingelen',
 'compost',
 'happiless',
 'precongestion',
 'fitweed',
 'fritterer',
 'mult',
 'gynomonoeciously',
 'heterocerous',
 'redisperse',
 'skunkery',
 'denizenize',
 'handicapped',
 'nummulitoid',
 'chaplainship',
 'crotched',
 'enhydros',
 'boronic',
 'intraspinal',
 'aligreek',
 'bifidly',
 'stambouline',
 'overbearance',
 'involutedly',
 'basicity',
 'focimetry',
 'bacchante',
 'swingletree',
 'cyrenaic',
 'knoller',
 'vitelliferous',
 'bootikin',
 'immortified',
 'prorestoration',
 'baikerite',
 'godwinian',
 'paeanism',
 'petalodus',
 'daphnioid',
 'trustwoman',
 'frenchy',
 'wedgewise',
 'britishness',
 'ust',
 'azonaphthalene',
 'unabbreviated',
 'eddaic',
 'valkyrie',
 'oxysulphate',
 'newlywed',
 'supportable',
 'plaiting',
 'lashlite',
 'mortiferously',
 'precirculate',
 'unimmersed',
 'sauropod',
 'uncleanable',
 'unsevere',
 'jazerant',
 'australianize',
 'pachydermial',
 'psychopannychy',
 'palaeotypographist',
 'slidably',
 'doctoral',
 'embryopathology',
 'lamiac

In [16]:
# ...turned into a sorted list: sorted() accepts any iterable and always
# returns a new list; the `with` block closes the file afterwards
with open('words.txt', 'r') as word_file:
    words = sorted({line.strip().lower() for line in word_file})

In [17]:
# display the result: a list in alphabetical order, each word appearing once
words

['a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron',
 'aaronic',
 'aaronical',
 'aaronite',
 'aaronitic',
 'aaru',
 'ab',
 'aba',
 'ababdeh',
 'ababua',
 'abac',
 'abaca',
 'abacate',
 'abacay',
 'abacinate',
 'abacination',
 'abaciscus',
 'abacist',
 'aback',
 'abactinal',
 'abactinally',
 'abaction',
 'abactor',
 'abaculus',
 'abacus',
 'abadite',
 'abaff',
 'abaft',
 'abaisance',
 'abaiser',
 'abaissed',
 'abalienate',
 'abalienation',
 'abalone',
 'abama',
 'abampere',
 'abandon',
 'abandonable',
 'abandoned',
 'abandonedly',
 'abandonee',
 'abandoner',
 'abandonment',
 'abanic',
 'abantes',
 'abaptiston',
 'abarambo',
 'abaris',
 'abarthrosis',
 'abarticular',
 'abarticulation',
 'abas',
 'abase',
 'abased',
 'abasedly',
 'abasedness',
 'abasement',
 'abaser',
 'abasgi',
 'abash',
 'abashed',
 'abashedly',
 'abashedness',
 'abashless',
 'abashlessly',
 'abashment',
 'abasia',
 'abasic',
 'abask',
 'abassin',
 'abastardize',
 'abatable',
 'abate',
 'a

In [18]:
# fewer entries than the number of lines read from the file, because
# lowercasing and the set removed words that differed only in capitalization
len(words)

234371

In [19]:
# readlines() returns a list of all lines from an open file;
# the encoding is given explicitly because this file contains accented
# characters stored as latin1
with open('francais.txt', 'r', encoding='latin1') as french_file:
    french_lines = french_file.readlines()

# only a top-level trailing expression is displayed by the notebook,
# so the result is shown here rather than inside the `with` block
french_lines

['\n',
 'a\n',
 'ab\n',
 'abaissa\n',
 'abaissai\n',
 'abaissaient\n',
 'abaissais\n',
 'abaissait\n',
 'abaissant\n',
 'abaissas\n',
 'abaissasse\n',
 'abaissassent\n',
 'abaissasses\n',
 'abaissassiez\n',
 'abaissassions\n',
 'abaissâmes\n',
 'abaissât\n',
 'abaissâtes\n',
 'abaisse\n',
 'abaissement\n',
 'abaissements\n',
 'abaissent\n',
 'abaisser\n',
 'abaissera\n',
 'abaisserai\n',
 'abaisseraient\n',
 'abaisserais\n',
 'abaisserait\n',
 'abaisseras\n',
 'abaisserez\n',
 'abaisseriez\n',
 'abaisserions\n',
 'abaisserons\n',
 'abaisseront\n',
 'abaisses\n',
 'abaisseur\n',
 'abaisseurs\n',
 'abaissez\n',
 'abaissé\n',
 'abaissée\n',
 'abaissées\n',
 'abaissés\n',
 'abaissèrent\n',
 'abaissiez\n',
 'abaissions\n',
 'abaissons\n',
 'abandon\n',
 'abandonna\n',
 'abandonnai\n',
 'abandonnaient\n',
 'abandonnais\n',
 'abandonnait\n',
 'abandonnant\n',
 'abandonnas\n',
 'abandonnasse\n',
 'abandonnassent\n',
 'abandonnasses\n',
 'abandonnassiez\n',
 'abandonnassions\n',
 'abandonnâmes\