## 1. BASIC PROBABILITY

### Question 1: Write `compute_mean()` function

In [2]:
import numpy as np

In [3]:
def compute_mean(X):
    """Return the arithmetic mean of the values in ``X``.

    Args:
        X: Array-like of numbers (anything ``np.mean`` accepts).

    Returns:
        The mean as computed by ``np.mean``.
    """
    mean_value = np.mean(X)
    return mean_value

In [4]:
# Sample observations used to check compute_mean().
X = [2, 0, 2, 2, 7, 4, -2, 5, -1, -1]
mean_value = compute_mean(X)

print("Mean : ", mean_value)

Mean :  1.8


### Question 2: Write `compute_median()` function

In [5]:
def compute_median(X):
    """Return the median of the values in ``X``.

    The values are sorted with ``np.sort``. An odd-length input yields its
    middle element; an even-length input yields half the sum of its two
    middle elements.

    Args:
        X: Sequence of numbers supporting ``len()``.

    Returns:
        The middle element (odd length) or the average of the two middle
        elements (even length).
    """
    count = len(X)
    ordered = np.sort(X)
    half, is_odd = divmod(count, 2)

    if is_odd:
        # Zero-based position of the single middle element.
        return ordered[half]

    # Even length: the two middle elements sit at half - 1 and half.
    return 0.5 * (ordered[half - 1] + ordered[half])

In [6]:
# Even-length sample, so the median averages the two middle values.
X = [1, 5, 4, 4, 9, 13]
median_value = compute_median(X)
print("Median: ", median_value)

Median:  4.5


### Question 3: Write `compute_std()` function

In [7]:
def compute_std(X):
    """Return the population standard deviation of the values in ``X``.

    Computes ``sqrt(sum((x - mean)^2) / N)``; the squared deviations are
    divided by ``N`` (not ``N - 1``).

    Args:
        X: Non-empty array-like of numbers.

    Returns:
        The standard deviation as a NumPy float.

    Raises:
        ValueError: If ``X`` is empty.
    """
    values = np.asarray(X)
    if values.size == 0:
        # Fail with a clear message instead of dividing by a zero count.
        raise ValueError("compute_std() requires at least one value")

    deviations = values - values.mean()
    # Sum over the observations (first axis), then divide by their count.
    variance = np.sum(np.square(deviations), axis=0) / len(values)

    return np.sqrt(variance)

In [8]:
# Sample observations used to check compute_std().
X = [171, 176, 155, 167, 169, 182]
std_value = compute_std(X)

print(np.round(std_value, 2))

8.33


### Question 4: Write `compute_correlation_coefficient()` function

In [9]:
def compute_correlation_coefficient(X, Y):
    """Return the Pearson correlation coefficient of ``X`` and ``Y``.

    Uses the computational formula::

        r = (N*sum(x*y) - sum(x)*sum(y))
            / (sqrt(N*sum(x^2) - sum(x)^2) * sqrt(N*sum(y^2) - sum(y)^2))

    Args:
        X: 1-D array-like of numbers (list, tuple, ndarray, ...).
        Y: 1-D array-like of numbers with the same length as ``X``.

    Returns:
        The coefficient rounded to 2 decimal places. When either input is
        constant the correlation is undefined (the denominator is zero) and
        a non-finite value such as ``nan`` may be returned.

    Raises:
        ValueError: If ``X`` and ``Y`` do not have the same shape.
    """
    # Convert up front so plain Python sequences work (they have no ``.dot``)
    # and the sums accumulate in floating point rather than fixed-width ints.
    x = np.asarray(X, dtype=float)
    y = np.asarray(Y, dtype=float)
    if x.shape != y.shape:
        raise ValueError(
            f"X and Y must have the same shape, got {x.shape} and {y.shape}"
        )

    N = len(x)
    sum_x = np.sum(x)
    sum_y = np.sum(y)

    numerator = N * x.dot(y) - sum_x * sum_y
    spread_x = np.sqrt(N * np.sum(np.square(x)) - pow(sum_x, 2))
    spread_y = np.sqrt(N * np.sum(np.square(y)) - pow(sum_y, 2))
    denominator = spread_x * spread_y

    return np.round(numerator / denominator, 2)

In [10]:
# Y is the element-wise square of X: fully dependent on X, but not linearly.
X = np.array([-2, -5, -11, 6, 4, 15, 9])
Y = np.square(X)

correlation_value = compute_correlation_coefficient(X, Y)
print("Correlation: ", correlation_value)

Correlation:  0.42


## 2. TABULAR DATA ANALYSIS

### Question 5:

In [11]:
import pandas as pd

# Load the advertising dataset (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer='advertising.csv')

def correlation(x, y):
    """Return the (unrounded) Pearson correlation coefficient of ``x`` and ``y``.

    Uses the computational formula::

        r = (N*sum(x*y) - sum(x)*sum(y))
            / (sqrt(N*sum(x^2) - sum(x)^2) * sqrt(N*sum(y^2) - sum(y)^2))

    Args:
        x: 1-D array-like of numbers (list, ndarray, pandas Series, ...).
        y: 1-D array-like of numbers with the same length as ``x``.
            Values are paired with ``x`` by position.

    Returns:
        The correlation coefficient as a NumPy float.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same shape.
    """
    # Work on float arrays so any array-like is accepted, not only objects
    # that provide a ``.dot`` method.
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)
    if x_values.shape != y_values.shape:
        raise ValueError(
            f"x and y must have the same shape, got {x_values.shape} and {y_values.shape}"
        )

    N = len(x_values)
    sum_x = np.sum(x_values)
    sum_y = np.sum(y_values)

    # The dot product is already the scalar sum of products.
    numerator = N * x_values.dot(y_values) - sum_x * sum_y
    spread_x = np.sqrt(N * np.sum(np.square(x_values)) - pow(sum_x, 2))
    spread_y = np.sqrt(N * np.sum(np.square(y_values)) - pow(sum_y, 2))

    return numerator / (spread_x * spread_y)

# Correlation between the 'TV' and 'Radio' columns.
x, y = data['TV'], data['Radio']
corr_xy = correlation(x, y)

print(round(corr_xy, 2))

0.05


### Question 6:

In [12]:
features = ['TV', 'Radio', 'Newspaper']

# Every ordered pair of features, including each feature with itself.
feature_pairs = [(first, second) for first in features for second in features]

for feature_1, feature_2 in feature_pairs:
    correlation_value = correlation(data[feature_1], data[feature_2])
    print(f"{feature_1} and {feature_2}: {round(correlation_value, 2)}")

TV and TV: 1.0
TV and Radio: 0.05
TV and Newspaper: 0.06
Radio and TV: 0.05
Radio and Radio: 1.0
Radio and Newspaper: 0.35
Newspaper and TV: 0.06
Newspaper and Radio: 0.35
Newspaper and Newspaper: 1.0


### Question 7:

In [13]:
# 2x2 correlation matrix of the 'Radio' and 'Newspaper' columns.
x, y = data['Radio'], data['Newspaper']
result = np.corrcoef(x, y)

print(result)

[[1.         0.35410375]
 [0.35410375 1.        ]]


### Question 8:

In [14]:
# Calculate the correlation matrix
data_corr_coef = data.corr()
data_corr_coef

Unnamed: 0,TV,Radio,Newspaper,Sales
TV,1.0,0.054809,0.056648,0.901208
Radio,0.054809,1.0,0.354104,0.349631
Newspaper,0.056648,0.354104,1.0,0.15796
Sales,0.901208,0.349631,0.15796,1.0


### Question 9

In [None]:
# The plotting API (figure/show) lives in matplotlib.pyplot, not in the
# top-level matplotlib package.
import matplotlib.pyplot as plt
import seaborn as sns

# Heatmap of the correlation matrix, each cell annotated to 2 decimals.
plt.figure(figsize=(10,8))
sns.heatmap(data_corr_coef, annot=True, fmt=".2f", linewidths=.5)
plt.show()

## 3. TEXT RETRIEVAL

### Question 10

In [34]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
vi_data_df = pd.read_csv("vi_text_retrieval.csv")

# Lower-case every document before vectorizing.
context = [doc.lower() for doc in vi_data_df['text']]

# Fit the TF-IDF vocabulary on the corpus and embed every document.
tfidf_vectorizer = TfidfVectorizer()
context_embedded = tfidf_vectorizer.fit_transform(context)

# TF-IDF weight at row 7, column 0 of the dense matrix.
context_embedded.toarray()[7, 0]

0.31126580760710637

### Question 11:

In [36]:
def tfidf_search(question, tfidf_vectorizer, top_d=5):
    """Rank the corpus documents by TF-IDF cosine similarity to ``question``.

    Reads the notebook-level ``context_embedded`` matrix, so that variable
    must be defined before this function is called.

    Args:
        question: Query string; it is lower-cased before being vectorized.
        tfidf_vectorizer: Fitted vectorizer whose ``transform`` embeds the query.
        top_d: Number of best-matching documents to return.

    Returns:
        A list of dicts with keys ``'id'`` (row index into
        ``context_embedded``) and ``'cosine_score'``, best match first.
    """
    query_embedded = tfidf_vectorizer.transform([question.lower()])
    cosine_scores = cosine_similarity(context_embedded, query_embedded).reshape((-1,))

    # Reverse first, then truncate: slicing with ``[-top_d:]`` would return
    # every document when top_d == 0, because ``[-0:]`` is the whole array.
    top_indices = cosine_scores.argsort()[::-1][:top_d]

    return [
        {
            'id': idx,
            'cosine_score': cosine_scores[idx]
        }
        for idx in top_indices
    ]

In [37]:
# Use the first question in the dataset as the query.
question = vi_data_df['question'].iloc[0]
results = tfidf_search(question, tfidf_vectorizer, top_d=5)

# Cosine score of the best match.
results[0]['cosine_score']

0.6279910475266974

### Question 12:

In [38]:
def corr_search(question, tfidf_vectorizer, top_d=5):
    """Rank the corpus documents by Pearson correlation with ``question``.

    Reads the notebook-level ``context_embedded`` matrix, so that variable
    must be defined before this function is called.

    Args:
        question: Query string; it is lower-cased before being vectorized.
        tfidf_vectorizer: Fitted vectorizer whose ``transform`` embeds the query.
        top_d: Number of best-matching documents to return.

    Returns:
        A list of dicts with keys ``'id'`` (row index into
        ``context_embedded``) and ``'corr_score'``, best match first.
    """
    query_embedded = tfidf_vectorizer.transform([question.lower()])

    # np.corrcoef stacks the query vector on top of the document rows, so
    # row 0 of the result holds the query's correlation with itself
    # (column 0) followed by its correlation with every document.
    corr_scores = np.corrcoef(
        query_embedded.toarray()[0],
        context_embedded.toarray()
    )
    corr_scores = corr_scores[0][1:]

    # Reverse first, then truncate: slicing with ``[-top_d:]`` would return
    # every document when top_d == 0, because ``[-0:]`` is the whole array.
    top_indices = corr_scores.argsort()[::-1][:top_d]

    return [
        {
            'id': idx,
            'corr_score': corr_scores[idx]
        }
        for idx in top_indices
    ]

In [39]:
# Use the first question in the dataset as the query.
question = vi_data_df['question'].iloc[0]
results = corr_search(question, tfidf_vectorizer, top_d=5)

# Correlation score of the second-best match.
results[1]['corr_score']

0.2073424647197323