# Data Cleaning

In [46]:
# Sample sentence, framed by one leading and one trailing newline.
text = "\nRose is a rose is a rose is a rose\n"
print(text)


Rose is a rose is a rose is a rose



In [49]:
# Load the sample sentence from disk; the context manager closes the file
# as soon as its content has been read.
with open('./rose.txt', mode='r') as f:
    text = f.read()

print(text)

Rose is a rose is a rose is a rose


In [47]:
import string

text = 'Röse is a Rose, is 1 rose!?'

# Step 1: flatten newlines to blanks and transliterate German umlauts.
# Upper-case umlauts map to capitalised digraphs ("Ö" -> "Oe", "Ü" -> "Ue"),
# so the table is also correct when used without the lower-casing step below.
umlaut_table = str.maketrans({
    "\n": " ",
    "ä": "ae", "Ä": "Ae",
    "ö": "oe", "Ö": "Oe",
    "ü": "ue", "Ü": "Ue",
})
text = text.translate(umlaut_table)

# Step 2: normalise case.
text = text.lower()

# Step 3: drop all digits (the blanks around a removed digit are kept).
remove_digits = str.maketrans('', '', '0123456789')
text = text.translate(remove_digits)

# Step 4: drop every ASCII punctuation character.
text = text.translate(str.maketrans('', '', string.punctuation))


print(text)

roese is a rose is  rose


# Tokenization

## 1. The Python way

In [100]:
text = 'rose is a rose. The dot is magic'

# Word tokenization: cut the string at every single blank.
# This approach is known as whitespace tokenization.
token1 = text.split(sep=' ')
print(token1)

# Another useful example: mentions, hashtags and emoticons stay in one piece.
tweet = "@lindner lol, that was #awesome :)"
token2 = tweet.split(sep=' ')
print(token2)

# Sentence tokenization: cut the string at every dot.
token3 = text.split(sep='.')
print(token3)

['rose', 'is', 'a', 'rose.', 'The', 'dot', 'is', 'magic']
['@lindner', 'lol,', 'that', 'was', '#awesome', ':)']
['rose is a rose', ' The dot is magic']


## 2. with tokenizers from NLP libraries

See some word tokenizer demos here: https://text-processing.com/demo/tokenize/

### NLTK

In [101]:
from nltk.tokenize import word_tokenize

# Tokenize the sample text with NLTK's word tokenizer and show the tokens.
tokenized_word = word_tokenize(text)
print(tokenized_word)

['rose', 'is', 'a', 'rose', '.', 'The', 'dot', 'is', 'magic']


### TextBlob

In [102]:
from textblob_de import Word
from textblob_de import TextBlobDE as TextBlob
# Wrap the sample text in a TextBlob object (TextBlobDE, imported as TextBlob).
transblob = TextBlob(text)
print(transblob)

rose is a rose. The dot is magic


In [105]:
# Lower-case the whole blob.
# API reference: https://textblob.readthedocs.io/en/dev/api_reference.html
blob_Wl = transblob.lower()
print(blob_Wl)

rose is a rose. the dot is magic


In [106]:
# 1. List the individual sentences of the text.
print("sentences: ", transblob.sentences, "\n")

# 2. List the individual elements (tokens) of the sentences.
print("words1: ", transblob.tokens, "\n")

# 3. Alternative: list the words; punctuation is dropped right away.
print("words2: ", transblob.words, "\n")

sentences:  [Sentence("rose is a rose."), Sentence("The dot is magic")] 

words1:  ['rose', 'is', 'a', 'rose', '.', 'The', 'dot', 'is', 'magic'] 

words2:  ['rose', 'is', 'a', 'rose', 'The', 'dot', 'is', 'magic'] 



# Inspect Datasets

## JSON Files (NDJSON)

See also: https://www.w3schools.com/python/pandas/pandas_json.asp

In [58]:
# Open the newline-delimited JSON corpus for reading.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default (which is not UTF-8 everywhere).
# The handle is left open here because `corpus` is iterated after this cell.
corpus = open('gutenberg-poetry-v001.ndjson', 'r', encoding='utf-8')

In [59]:
import json

In [60]:
# Parse the NDJSON corpus: every non-empty line is one standalone JSON document.
# `with corpus:` closes the previously opened file handle once the loop is done,
# so the descriptor is not leaked for the lifetime of the kernel.
# Consequence: re-running this cell without re-opening `corpus` raises
# ValueError (closed file) instead of silently yielding an empty list.
all_lines = []
with corpus:
    for line in corpus:
        record = line.strip()
        if not record:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        all_lines.append(json.loads(record))

In [61]:
# Peek at the first five parsed records.
print(all_lines[0:5])

[{'s': 'The Song of Hiawatha is based on the legends and stories of', 'gid': '19'}, {'s': 'many North American Indian tribes, but especially those of the', 'gid': '19'}, {'s': 'Ojibway Indians of northern Michigan, Wisconsin, and Minnesota.', 'gid': '19'}, {'s': 'They were collected by Henry Rowe Schoolcraft, the reknowned', 'gid': '19'}, {'s': 'Schoolcraft married Jane, O-bah-bahm-wawa-ge-zhe-go-qua (The', 'gid': '19'}]


In [82]:
import random

# Draw 8 records at random to get a feel for the corpus.
# A dedicated, seeded generator keeps the draw reproducible across
# "Restart & Run All" and leaves the global random state untouched.
# NOTE(review): the output recorded below predates the seed and will differ
# after the next run; change or drop the seed to see other samples.
random.Random(42).sample(all_lines, 8)

[{'s': 'And howsoever pride may roll', 'gid': '36150'},
 {'s': 'Up in the air and down!', 'gid': '136'},
 {'s': '"It\'s the townsfolks\' cheery compliment', 'gid': '2863'},
 {'s': 'To endure another like it -- and another -- till I\'m dead?"',
  'gid': '1040'},
 {'s': "Coz there the men ain't nothin' more'n idees,--", 'gid': '3650'},
 {'s': '"You wouldn\'t ha\' never ben here but for me.', 'gid': '3650'},
 {'s': 'And bending down beside the glowing bars,', 'gid': '1304'},
 {'s': 'The vigilance to which we clung.', 'gid': '1165'}]

In [107]:
# Glue the text field ('s') of the first five records into one string,
# separated by single blanks. The other field, 'gid', is not used here.
big_poem = " ".join(record['s'] for record in all_lines[:5])
print(big_poem)

The Song of Hiawatha is based on the legends and stories of many North American Indian tribes, but especially those of the Ojibway Indians of northern Michigan, Wisconsin, and Minnesota. They were collected by Henry Rowe Schoolcraft, the reknowned Schoolcraft married Jane, O-bah-bahm-wawa-ge-zhe-go-qua (The


## CSV Files

In [43]:
import pandas as pd
# Load the poetry data set from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="poetry.csv")

# Display the first 5 rows.
df.head(n=5)

Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore
4,RICHARD BARNFIELD,"Long have I longd to see my love againe,\r\nSt...",Sonnet 16,Renaissance,Mythology & Folklore


### Read out specific columns


#### Use the desired column as the index

In [70]:
#pd.read_csv?
See also: https://www.w3schools.com/python/pandas/pandas_csv.asp

In [72]:
# Read only the first data row of the CSV file and use the 'content'
# column as the index.
df = pd.read_csv(
    "poetry.csv",
    index_col='content',
    nrows=1,
)

# Show the resulting index.
var1 = df.index
print(var1)

Index(['Let the bird of loudest lay\r\nOn the sole Arabian tree\r\nHerald sad and trumpet be,\r\nTo whose sound chaste wings obey.\r\n\r\nBut thou shrieking harbinger,\r\nFoul precurrer of the fiend,\r\nAugur of the fever's end,\r\nTo this troop come thou not near.\r\n\r\nFrom this session interdict\r\nEvery fowl of tyrant wing,\r\nSave the eagle, feather'd king;\r\nKeep the obsequy so strict.\r\n\r\nLet the priest in surplice white,\r\nThat defunctive music can,\r\nBe the death-divining swan,\r\nLest the requiem lack his right.\r\n\r\nAnd thou treble-dated crow,\r\nThat thy sable gender mak'st\r\nWith the breath thou giv'st and tak'st,\r\n'Mongst our mourners shalt thou go.\r\n\r\nHere the anthem doth commence:\r\nLove and constancy is dead;\r\nPhoenix and the Turtle fled\r\nIn a mutual flame from hence.\r\n\r\nSo they lov'd, as love in twain\r\nHad the essence but in one;\r\nTwo distincts, division none:\r\nNumber there in love was slain.\r\n\r\nHearts remote, yet not asunder;\r\nDis