In [1]:
import random
import re

import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.types import *
# Raise pandas' per-cell display width limit to a very large value so the long
# generated texts shown via toPandas() further down are not truncated.
pd.options.display.max_colwidth = 99999

In [2]:
# Convert csv files to ascii (you can get the original encoding running: file -i file.csv)
# Each command decodes its input with the encoding given by -f (worldcities.csv as
# UTF-8, countries.csv as UTF-16) and writes the result to the *_ascii.csv path given
# by -o; the "Load data" cell reads those *_ascii.csv files, not the originals.
# NOTE(review): -c together with //TRANSLIT presumably drops or approximates characters
# that have no ASCII equivalent, so converted names may differ from the originals and
# may contain substitute punctuation -- confirm against the local iconv implementation.
!iconv -c -f utf-8 -t ascii//TRANSLIT ../data/worldcities.csv -o ../data/worldcities_ascii.csv
!iconv -c -f utf-16 -t ascii//TRANSLIT ../data/countries.csv -o ../data/countries_ascii.csv

In [3]:
# Load data
def _read_column(path, column, limit=None):
    """Collect one column of a headered CSV file into a Python list.

    path   -- CSV location handed to spark.read.csv (first line is the header)
    column -- name of the column whose values are returned
    limit  -- when given, only this many rows are read via DataFrame.limit
    """
    frame = spark.read.csv(path, header=True)
    if limit is not None:
        frame = frame.limit(limit)
    return [row[column] for row in frame.collect()]


# The full word list is used as filler vocabulary; each keyword class is
# restricted to ten rows of its file.
words = _read_column('../data/words.csv', 'word')
names = _read_column('../data/baby-names.csv', 'name', limit=10)
cities = _read_column('../data/worldcities_ascii.csv', 'city', limit=10)
countries = _read_column('../data/countries_ascii.csv', 'country', limit=10)

In [4]:
# Create classes dictionary
#
# Each class maps to ONE regular expression (an alternation of its keywords)
# that is later handed to Column.rlike. The keywords come straight from CSV
# files, so they are treated as literal text, not as regex fragments.
def _keywords_to_pattern(keywords):
    """Build a regex alternation that matches any of ``keywords`` literally.

    keywords -- iterable of strings; None and empty entries are skipped, since
                an empty alternative would make the pattern match every text.

    Returns the escaped keywords joined with '|'. When no usable keyword is
    left, returns '(?!)' (an empty negative lookahead, which never matches)
    instead of '', because rlike('') is true for every row.

    Matching stays substring-based, exactly as with the plain '|'.join: the
    only difference is that regex metacharacters inside a keyword (for example
    '.', '(', '?' or '^', which transliterated names may contain) no longer
    alter or break the pattern.
    """
    escaped = [re.escape(keyword) for keyword in keywords if keyword]
    return '|'.join(escaped) if escaped else '(?!)'


dic = {
    'names' : _keywords_to_pattern(names),
    'cities' : _keywords_to_pattern(cities),
    'countries' : _keywords_to_pattern(countries)
}

In [5]:
# Generate random dataframe with strings to classify
#
# Every row is a text of `words_per_text` tokens drawn uniformly, with
# replacement, from the filler words plus all class keywords.
words_per_text = 100
nrows = 100000
words_and_keywords = words + names + cities + countries

# A dedicated, seeded generator makes the generated texts (and therefore the
# class counts printed at the end of the notebook) reproducible on every
# Restart & Run All, without touching the global `random` state.
rng = random.Random(42)
content = [' '.join(rng.choices(words_and_keywords, k=words_per_text)) for _ in range(nrows)]

# A plain StringType schema yields a single column named 'value'.
df = spark.createDataFrame(content, StringType())

In [6]:
# Bring the first ten generated texts to the driver as pandas and render them.
_generated_preview = df.limit(10).toPandas()
display(_generated_preview)

Unnamed: 0,value
0,can hurt Shtime shorter in milk outer loud Henry past dinner price compare strip Dragash ready degree hurry music bright image trace list condition topic flow feel pony you pull account impossible horse cabin nature even continent sign quickly donkey pale flow setting wolf meet post bread lungs coat car condition experiment to real tales hurt image height ruler hour low flat cabin weight he doctor quietly swam fish double find stairs hidden bee is grade twenty combine lips topic eye prize south function team accurate past beautiful explore ahead obtain purpose receive myself remain nuts history either blanket log
1,rapidly bigger club valuable grandfather wrapped care between collect hand saw mirror nest perfect shoulder exist fought perhaps see nature money community hard seems eye function flow somewhere deep acres any hand gulf swam setting limited us cream surface successful likely scale fierce nest without topic stuck neighbor see past manner bicycle scientist quickly Kamenice afternoon live east numeral height farther say per column out famous past excitement soap pet under uncle control exactly seed slipped function seen labor branch above pupil chance right alphabet firm chance arrive body because word creature accurate gather settlers actually scientist verb gulf fewer
2,myself image rose dance fierce forgot send hurry mission ago tired grabbed only wonderful dead trick bottom anywhere when swing contrast wrapped wolf adventure flat wash drop spell gather therefore clock describe America jump center school tip box effort contrast die half short Henry word dry hurry word bottom term poetry fifth lady row rocky fish gasoline hard finally oil bag men income usual roar arrive helpful Charles several include push at trick beautiful life layers effort blanket movement Shterpce blind those gently out wild shirt create loss couple tape sure part exactly unhappy hour goes college skill same broke
3,poet pet shade wrapped when mission limited subject grade firm wear major eat several George without captured system bigger piece of anything fish branch flat gather hand exchange keep nation mud mail today special law check taught skill clearly George honor trip complex nothing recognize Viti trick actual pet grandfather above completely accurate which same straw chain piece uncle myself close too charge there equator dish flower create early asleep curious clearly likely seems judge gasoline drink population find direct bone wrapped daily system rubber degree Shtime teach usual even truck circle arrangement dead system rocky Dragash trip successful feel
4,garden equally many foreign crop salt number blind apple bright fireplace pull nodded nature whom hurt close please law special ancient quietly poetry effort river monkey labor shot too quickly donkey famous time flower tobacco garden foreign effort understanding usual individual tired body again married four rocky care hurt tune floor private lying collect progress purpose courage spell wheel refer word stiff shallow drink weak food east joy appropriate anywhere arrangement circle bee tobacco neck stick purpose valuable circle trip acres adventure mark series drop guess laugh choice continent year would surface feel hidden firm capital expect series coat using
5,shorter row whom foreign lying everyone star car kind scene lake weight numeral principal growth sat immediately donkey branch when path distant blanket location dear Robert leaf effort donkey oldest perhaps dry numeral involved baby sweet captured add usual widely mainly mark consonant topic blanket die compare folks run dollar having function hurry oil pupil likely dinner getting south under speech thread hall blind shoe solid swam say hunter wonderful roar through silver public past arrangement black getting aboard helpful human tune army fun pile account distant stove dawn can afternoon those choose slip common practical instrument down native orange
6,William directly charge cutting leave fur sign spring topic along accurate share possible bite help doll circle wash rather loss tired piano mainly visitor skill clothing Argentina own thousand equally colony log earlier steady swing factor noon allow widely mine flower hurry clothing perhaps myself horse find scientist whom possibly die listen acres cry won diameter broke again height unit grade fewer winter bite factor bag same anyone food sign growth create Angola pony built sun afternoon food clothes means goes studying scared life would collect steady quiet bee nodded tune law sail compound piece job buy picture care Algeria
7,getting wealth chief among pull low collect bag told without pure trace through numeral silver path military scared Afghanistan instant labor shot progress naturally layers equally school in battle myself see answer wear expect myself pull out sheet gift lying offer lips thousand like where affect sweet out Zubin Potok between particularly silver ill sight during rabbit property roar exist lead born child couple thirty nation spring charge widely hollow had mission shut market appropriate steady shore sign fireplace share perfect surface growth compound object enough bread planning floating problem replace swing orange dinner teach roof explanation spell experiment surface account
8,progress history island dead impossible meet you strip home eager therefore involved breakfast city William beneath Robert studying even describe widely silence consonant anything stone main having fewer neighbor lying pet bill solid provide anywhere Henry crop several wild spell shape grandfather beneath pupil form create major topic lake needed column continent twenty asleep contrast come doubt mine collect load circus hat among help exact island leaving length satellites off special visitor therefore summer arrive gift neighborhood contrast Kamenice money studying double fish limited population ruler form blind means blanket labor so blanket fierce off golden capital various scene baby
9,light when dry Thomas adventure deep limited Joseph shirt surface share practical Zubin Potok wild clothing loud cloth dead topic pile recognize seems fire writing piece sheet Prizren grabbed ago needed usual rocky success choice early obtain along difficulty bite dollar expect leave ruler fell bottom out doubt alone Malisheve eye edge mission ready child care massage charge history affect go once pale involved where gulf coat short actual please among John among bread shore tales mirror word surface clearly aboard thread fire pony damage art torn hurt price compound exist column arrangement unhappy roof remove along Vushtrri writing struggle stiff


In [7]:
# Applies classification over dataframe
#
# The branches are evaluated in order, so a text containing keywords of several
# classes gets the first matching label: names, then cities, then countries.
# Texts matching none of the patterns are labelled 'undefined'.
text_col = df['value']
label_col = (
    F.when(text_col.rlike(dic['names']), 'names')
     .when(text_col.rlike(dic['cities']), 'cities')
     .when(text_col.rlike(dic['countries']), 'countries')
     .otherwise('undefined')
)
df = df.withColumn('classification_class', label_col)

In [8]:
# Render the first ten rows again, now including the classification_class column.
_classified_preview = df.limit(10).toPandas()
display(_classified_preview)

Unnamed: 0,value,classification_class
0,can hurt Shtime shorter in milk outer loud Henry past dinner price compare strip Dragash ready degree hurry music bright image trace list condition topic flow feel pony you pull account impossible horse cabin nature even continent sign quickly donkey pale flow setting wolf meet post bread lungs coat car condition experiment to real tales hurt image height ruler hour low flat cabin weight he doctor quietly swam fish double find stairs hidden bee is grade twenty combine lips topic eye prize south function team accurate past beautiful explore ahead obtain purpose receive myself remain nuts history either blanket log,names
1,rapidly bigger club valuable grandfather wrapped care between collect hand saw mirror nest perfect shoulder exist fought perhaps see nature money community hard seems eye function flow somewhere deep acres any hand gulf swam setting limited us cream surface successful likely scale fierce nest without topic stuck neighbor see past manner bicycle scientist quickly Kamenice afternoon live east numeral height farther say per column out famous past excitement soap pet under uncle control exactly seed slipped function seen labor branch above pupil chance right alphabet firm chance arrive body because word creature accurate gather settlers actually scientist verb gulf fewer,cities
2,myself image rose dance fierce forgot send hurry mission ago tired grabbed only wonderful dead trick bottom anywhere when swing contrast wrapped wolf adventure flat wash drop spell gather therefore clock describe America jump center school tip box effort contrast die half short Henry word dry hurry word bottom term poetry fifth lady row rocky fish gasoline hard finally oil bag men income usual roar arrive helpful Charles several include push at trick beautiful life layers effort blanket movement Shterpce blind those gently out wild shirt create loss couple tape sure part exactly unhappy hour goes college skill same broke,names
3,poet pet shade wrapped when mission limited subject grade firm wear major eat several George without captured system bigger piece of anything fish branch flat gather hand exchange keep nation mud mail today special law check taught skill clearly George honor trip complex nothing recognize Viti trick actual pet grandfather above completely accurate which same straw chain piece uncle myself close too charge there equator dish flower create early asleep curious clearly likely seems judge gasoline drink population find direct bone wrapped daily system rubber degree Shtime teach usual even truck circle arrangement dead system rocky Dragash trip successful feel,names
4,garden equally many foreign crop salt number blind apple bright fireplace pull nodded nature whom hurt close please law special ancient quietly poetry effort river monkey labor shot too quickly donkey famous time flower tobacco garden foreign effort understanding usual individual tired body again married four rocky care hurt tune floor private lying collect progress purpose courage spell wheel refer word stiff shallow drink weak food east joy appropriate anywhere arrangement circle bee tobacco neck stick purpose valuable circle trip acres adventure mark series drop guess laugh choice continent year would surface feel hidden firm capital expect series coat using,undefined
5,shorter row whom foreign lying everyone star car kind scene lake weight numeral principal growth sat immediately donkey branch when path distant blanket location dear Robert leaf effort donkey oldest perhaps dry numeral involved baby sweet captured add usual widely mainly mark consonant topic blanket die compare folks run dollar having function hurry oil pupil likely dinner getting south under speech thread hall blind shoe solid swam say hunter wonderful roar through silver public past arrangement black getting aboard helpful human tune army fun pile account distant stove dawn can afternoon those choose slip common practical instrument down native orange,names
6,William directly charge cutting leave fur sign spring topic along accurate share possible bite help doll circle wash rather loss tired piano mainly visitor skill clothing Argentina own thousand equally colony log earlier steady swing factor noon allow widely mine flower hurry clothing perhaps myself horse find scientist whom possibly die listen acres cry won diameter broke again height unit grade fewer winter bite factor bag same anyone food sign growth create Angola pony built sun afternoon food clothes means goes studying scared life would collect steady quiet bee nodded tune law sail compound piece job buy picture care Algeria,names
7,getting wealth chief among pull low collect bag told without pure trace through numeral silver path military scared Afghanistan instant labor shot progress naturally layers equally school in battle myself see answer wear expect myself pull out sheet gift lying offer lips thousand like where affect sweet out Zubin Potok between particularly silver ill sight during rabbit property roar exist lead born child couple thirty nation spring charge widely hollow had mission shut market appropriate steady shore sign fireplace share perfect surface growth compound object enough bread planning floating problem replace swing orange dinner teach roof explanation spell experiment surface account,cities
8,progress history island dead impossible meet you strip home eager therefore involved breakfast city William beneath Robert studying even describe widely silence consonant anything stone main having fewer neighbor lying pet bill solid provide anywhere Henry crop several wild spell shape grandfather beneath pupil form create major topic lake needed column continent twenty asleep contrast come doubt mine collect load circus hat among help exact island leaving length satellites off special visitor therefore summer arrive gift neighborhood contrast Kamenice money studying double fish limited population ruler form blind means blanket labor so blanket fierce off golden capital various scene baby,names
9,light when dry Thomas adventure deep limited Joseph shirt surface share practical Zubin Potok wild clothing loud cloth dead topic pile recognize seems fire writing piece sheet Prizren grabbed ago needed usual rocky success choice early obtain along difficulty bite dollar expect leave ruler fell bottom out doubt alone Malisheve eye edge mission ready child care massage charge history affect go once pale involved where gulf coat short actual please among John among bread shore tales mirror word surface clearly aboard thread fire pony damage art torn hurt price compound exist column arrangement unhappy roof remove along Vushtrri writing struggle stiff,names


In [9]:
# Validate results
#
# The expected size of each class is recomputed independently of the
# classification_class column, using the same priority as the classifier
# (names, then cities, then countries, else undefined), and then compared with
# what the column actually contains. Checking only that the four counts add up
# to the total would prove nothing about the column: the four filters split the
# rows into disjoint groups by construction.

# Build each predicate once instead of repeating the rlike expressions.
is_name = df['value'].rlike(dic['names'])
is_city = df['value'].rlike(dic['cities'])
is_country = df['value'].rlike(dic['countries'])

totalCount = df.count()
print('Total: ', totalCount)
namesCount = df.filter(is_name).count()
print('namesClass: ', namesCount)
citiesCount = df.filter(~is_name & is_city).count()
print('citiesClass: ', citiesCount)
countriesCount = df.filter(~is_name & ~is_city & is_country).count()
print('countriesClass: ', countriesCount)
undefinedCount = df.filter(~is_name & ~is_city & ~is_country).count()
print('undefined: ', undefinedCount)
print('Sum :', namesCount + citiesCount + countriesCount + undefinedCount)

# Cross-check against the labels stored in the dataframe (one aggregation).
labelCounts = {
    row['classification_class']: row['count']
    for row in df.groupBy('classification_class').count().collect()
}
expectedCounts = {
    'names': namesCount,
    'cities': citiesCount,
    'countries': countriesCount,
    'undefined': undefinedCount,
}
for label, expected in expectedCounts.items():
    actual = labelCounts.get(label, 0)
    assert actual == expected, (
        'classification_class mismatch for %r: column has %d rows, expected %d'
        % (label, actual, expected)
    )
# Any label outside the four known classes would indicate a classifier bug.
assert set(labelCounts) <= set(expectedCounts), (
    'unexpected labels: %r' % sorted(set(labelCounts) - set(expectedCounts))
)
assert namesCount + citiesCount + countriesCount + undefinedCount == totalCount, (
    'class counts do not add up to the total row count'
)


Total:  100000
namesClass:  62216
citiesClass:  23597
countriesClass:  8910
undefined:  5277
Sum : 100000
