## Exploratory data analysis (part 1)
Supervised Learning

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import requests
from sklearn.feature_extraction.text import CountVectorizer
from scipy import spatial

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
df = pd.read_csv('Data/example_data1.csv')

In [None]:
df.isnull()

In [None]:
df.isnull().sum()

In [None]:
# Eliminates rows with NA-values
df.dropna(axis=0)

In [None]:
# Eliminates columns with NA-values
df.dropna(axis=1)

In [None]:
# drops the rows where all of its values are NaN
df.dropna(how='all')

In [None]:
# it only drops the rows where NaN appears in the column 'D'

df.dropna(subset=['D'])

In [None]:
df_car = pd.read_csv('Data/car_data.csv')
df_car['num-of-doors'].unique()



In [None]:
# Mean of the 'engine-size' column, used below as the replacement value.
mean = df_car['engine-size'].mean()
# NOTE(review): DataFrame.replace is applied to the whole frame, so a '?' in
# any column (not only 'engine-size') is replaced by this mean - confirm that
# this is intended.
df_car = df_car.replace('?',mean)
# Show the distinct 'engine-size' values after the replacement.
df_car['engine-size'].unique()

### SimpleImputer

In [None]:
# Replace missing values (NaN) with the per-column mean using SimpleImputer.

# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
my_miss = np.nan
my_stra = 'mean'

imputer = SimpleImputer(missing_values=my_miss, strategy=my_stra)

# Fit and transform on the same DataFrame: fitting on a DataFrame and then
# transforming a bare array (df.values) makes scikit-learn warn about
# missing feature names. The result is still a NumPy array.
imputer = imputer.fit(df)
imputed_data = imputer.transform(df)
imputed_data


In [None]:
# Replace missing values (NaN) with the most frequent value of each column.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Transform the same DataFrame that was used for fitting so that the feature
# names match (passing df.values triggers a scikit-learn warning).
imputer = imputer.fit(df)
imputed_data = imputer.transform(df)
imputed_data


In [None]:
df_car.head(3)

In [None]:
df_car.hist(figsize=(10,10))

In [None]:
# Which column has the most missing values?
df_car.isna().sum()

In [None]:

mean_before=df_car.loc[:,'normalized-losses'].mean()
std_before=df_car.loc[:,'normalized-losses'].std()

In [None]:

# Impute only the 'normalized-losses' column with its own pre-imputation mean.
# A frame-wide replace would also overwrite the NaNs of unrelated columns with
# this value. np.nan is used because the np.NAN alias was removed in NumPy 2.0.
df_car['normalized-losses'] = df_car['normalized-losses'].replace(np.nan, mean_before)
# Recompute the statistics so they can be compared with the "before" values.
mean_after=df_car.loc[:,"normalized-losses"].mean()
std_after=df_car.loc[:,"normalized-losses"].std()

In [None]:

# Report how the imputation changed the mean and standard deviation.
print('mean before:', mean_before, '--->', 'mean_after:', mean_after)
print('std before:', std_before, '--->', 'std_after:', std_after)

In [None]:
data = {"house age [y]": [10,15,20,25,30,35,40], 
        "city 1 [m]": [1,0.5,0.3,0.2,0.1,0.09,0.08],
        "city 2 [m]": [35, 30, 25, 20, 15, 10,  5],
        "city 3 [m]": [10 ,9.9, 9.8, 9.6, 9.3, 9.1, 9.05],
        "city 4 [m]": [15 ,14.9, 14.8, 14.6, 14.3, 14.1, 14]}

house_df = pd.DataFrame(data)

In [None]:
width = 4
plt.bar(house_df.iloc[:,0], house_df.iloc[:,1], width)
plt.xlabel('house age [y]')
plt.ylabel('city 1 [m]')
plt.title('House cost')
plt.show()

In [None]:
house_df.loc[:,'city 1 [m]':'city 4 [m]'].plot(kind='bar')


In [None]:
mu = 0 
sigma = 1
rand_data = np.random.normal(mu, sigma, size=1000)
df_rand = pd.DataFrame({'rand1':rand_data})
df_rand.boxplot()

In [None]:
df_car.boxplot(column=['peak-rpm','price'])
df_car.hist(column=['price','peak-rpm'])

In [None]:
# Reading files 
# file = open("filename", "mode")
# file.read()
# file.close()

# 'r' – read mode
# 'w' – write mode
# 'a' – append mode
# 'r+' – special read and write mode


In [None]:
# Reading files 
with open('Data/house_df.txt') as file:
    data = file.read()

In [None]:
# Writing files
# The context manager closes the file even if the write raises.
with open("house_df.txt", "w") as file:
    file.write('Hello World!')


In [None]:
def read_write_75char(name, *, output_path='html_to_text.txt', n_chars=75):
    """Copy the leading characters of a text file into another file.

    Args:
        name: Path of the file to read.
        output_path: Path of the file to create or overwrite. Defaults to
            'html_to_text.txt' in the current working directory.
        n_chars: Non-negative number of leading characters to copy.
            Defaults to 75. A shorter source file is copied completely.
    """
    # Context managers close both handles even if reading or writing fails.
    with open(name, 'r') as source:
        # Read only what is needed instead of loading the whole file.
        head = source.read(n_chars)
    with open(output_path, 'w') as target:
        target.write(head)


In [None]:
# Testing function 
read_write_75char('Data/ParseMe.html')

### Response

In [None]:
# Always pass a timeout: without one, requests may wait indefinitely on an
# unresponsive server.
response = requests.get('https://api.github.com/user', timeout=10)

In [None]:
response.status_code

In [None]:
response.headers['content-type']

In [None]:
response.encoding

In [None]:
response.text

In [None]:
url = 'https://www.imdb.com/title/tt0052357/reviews'
# Always pass a timeout: without one, requests may wait indefinitely on an
# unresponsive server.
response = requests.get(url=url, timeout=10)
# Preview the first 130 characters of the returned page.
print(response.text[0:130])

### BeautifulSoup

In [None]:
html_soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
movie_containers = html_soup.find_all('div', class_ = 'text show-more__control')
print(type(movie_containers))
print(len(movie_containers))

In [None]:
movie_containers[0]

In [None]:
# Load the sample HTML document; the context manager closes the file even if
# reading fails.
with open('Data/ParseMe.html', 'r') as file:
    html_doc = file.read()

In [None]:
def parse_links(html_document=html_doc):
    """Print the ``href`` attribute of every ``<a>`` tag in an HTML string.

    Args:
        html_document: HTML source to parse. Defaults to the value that
            ``html_doc`` had when this function was defined.
    """
    anchors = BeautifulSoup(html_document, 'html.parser').find_all('a')
    for anchor in anchors:
        print(anchor.get('href'))

# Called without an argument: html_document falls back to its default, html_doc
parse_links()

### Writing a Tokenizer

In [None]:
df_movie = pd.read_csv('Data/movie_dataset.csv')
df_movie.head(3)

In [None]:
def basic_tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting list."""
    tokens = text.split()
    return tokens

review1 = df_movie.loc[0, 'review']
review2 = df_movie.loc[1, 'review']
review3 = df_movie.loc[2, 'review']

In [None]:
print(basic_tokenizer(review1))
print(basic_tokenizer(review2))
print(basic_tokenizer(review3))


### NLTK

In [None]:
def nltk_tokenizer(text):
    """Return the tokens produced by ``nltk.word_tokenize`` for ``text``.

    ``preserve_line=True`` is passed through to the NLTK tokenizer.
    """
    return nltk.word_tokenize(text, preserve_line=True)

# review1 = df_movie.loc[0, 'review']
# review2 = df_movie.loc[1, 'review']
# review3 = df_movie.loc[2, 'review']

In [None]:
print(nltk_tokenizer(review1))
print(nltk_tokenizer(review2))
print(nltk_tokenizer(review3))
# returns tokens 

In [None]:
# nltk.download('stopwords')

In [None]:
stop = nltk.corpus.stopwords.words('english')
print(stop[0:5])
print(len(stop))

In [None]:
def tokens_wo_stopword(text):
    """Tokenize ``text`` with NLTK and drop English stop words.

    The comparison is case-insensitive: the NLTK stop-word list is lowercase,
    so a capitalised token such as "The" would otherwise slip through.

    Args:
        text: The string to tokenize.

    Returns:
        The remaining tokens, in their original order and casing.
    """
    # A set gives O(1) membership tests; the list would be scanned per token.
    stop_words = set(stop)
    tokens = nltk.word_tokenize(text, preserve_line=True)
    return [tok for tok in tokens if tok.lower() not in stop_words]

In [None]:
print(tokens_wo_stopword(review1))


### Reminder: List comprehension

In [None]:
tokens = nltk.word_tokenize(review1, preserve_line=True)
print(tokens)

In [None]:
# These are the same:
# [expression for item in list if condition]

# for item in list: 
#     if condition: 
#         expression

lst_comp = [tok for tok in tokens if tok not in stop]

### Normalization - Porter Stemmer

In [None]:
# Example: 
# U.S.A. should be matched with USA

In [None]:
porter = nltk.stem.porter.PorterStemmer()
def porter_stemmer(text):
    """Return the Porter stem of every whitespace-separated token in ``text``."""
    stems = []
    for token in basic_tokenizer(text):
        stems.append(porter.stem(token))
    return stems

In [None]:
print(porter_stemmer(review1))

In [None]:
def porter_stemmer_wo_stops(text):
    """Porter-stem the whitespace-separated tokens of ``text``, skipping stop words.

    The stop-word check is case-insensitive: the NLTK stop-word list is
    lowercase, so a capitalised token such as "The" would otherwise pass the
    filter and still end up in the result.

    Args:
        text: The string to process.

    Returns:
        The stems of all tokens that are not stop words, in order.
    """
    # A set gives O(1) membership tests; the list would be scanned per token.
    stop_words = set(stop)
    tokens = basic_tokenizer(text)
    return [porter.stem(tok) for tok in tokens if tok.lower() not in stop_words]

In [None]:
print(porter_stemmer_wo_stops(review1))

### Normalization - Lemmatization

In [None]:
nltk.download('wordnet')

In [None]:
def nltk_lemmatizer(text):
    """Lemmatize each whitespace-separated token of ``text`` with WordNet."""
    words = basic_tokenizer(text)
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    return list(map(lemmatize, words))

In [None]:
print(nltk_lemmatizer(review1))

In [None]:
def nltk_lemmatizer_wo_stops(text):
    """Lemmatize the whitespace-separated tokens of ``text``, skipping stop words.

    The stop-word check is case-insensitive: the NLTK stop-word list is
    lowercase, so a capitalised token such as "The" would otherwise slip
    through the filter.

    Args:
        text: The string to process.

    Returns:
        The WordNet lemmas of all tokens that are not stop words, in order.
    """
    # A set gives O(1) membership tests; the list would be scanned per token.
    stop_words = set(stop)
    tokens = basic_tokenizer(text)
    lemma = nltk.WordNetLemmatizer()
    return [lemma.lemmatize(tok) for tok in tokens if tok.lower() not in stop_words]

In [None]:
print(nltk_lemmatizer_wo_stops(review1))

### Normalization - Bag of Words

In [None]:
count = CountVectorizer(ngram_range=(1,3))
docs = np.array(['The sun is shining',                              # Document 1
                 'The weather is sweet',                            # Document 2
                 'The sun is shining and the weather is sweet'])    # Document 3

In [None]:
bag = count.fit_transform(docs)
print('Vocabular:', count.vocabulary_)

In [None]:
# Print each entry of the fitted vocabulary on its own line.
for key in count.vocabulary_:
    print(key)

In [None]:
bag_array = bag.toarray()
bag_array

### Cosine similarity

In [None]:
def compute_cos_sim(array1, array2):
    """Return the cosine similarity of two vectors.

    Computed as one minus SciPy's cosine distance between ``array1`` and
    ``array2``.
    """
    return 1 - spatial.distance.cosine(array1, array2)

In [None]:
compute_cos_sim(bag_array[0], bag_array[2])

In [None]:
def compute_cos_sim_all(my_array):
    """Build the matrix of pairwise cosine similarities between rows.

    Args:
        my_array: Two-dimensional array; each row is treated as one vector.

    Returns:
        A square array with one row and column per input row, where entry
        ``[i, j]`` is ``compute_cos_sim`` of row ``i`` and row ``j``.
    """
    size = my_array.shape[0]
    similarities = np.zeros((size, size))
    for i in range(size):
        first = my_array[i, :]
        for j in range(size):
            similarities[i, j] = compute_cos_sim(first, my_array[j, :])
    return similarities


In [None]:
comssim_values = compute_cos_sim_all(bag_array)

In [None]:
fig = plt.figure()
ax = plt.imshow(comssim_values, cmap = 'winter')
cbar = fig.colorbar(ax)
plt.show()