In [5]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Numpy tasks

For a detailed reference check out: https://numpy.org/doc/stable/reference/arrays.indexing.html.

**Task 1.** Calculate the sigmoid (logistic) function on every element of the following numpy array [0.3, 1.2, -1.4, 0.2, -0.1, 0.1, 0.8, -0.25] and print the last 5 elements. Use only vector operations.

In [2]:
# Write your code here
array = np.array([0.3, 1.2, -1.4, 0.2, -0.1, 0.1, 0.8, -0.25])
sigm = lambda x: 1/(1+np.exp(-x))
array2 = sigm(array[-5:])
print(array2)


[0.549834   0.47502081 0.52497919 0.68997448 0.4378235 ]


**Task 2.** Calculate the dot product of the following two vectors:<br/>
$x = [3, 1, 4, 2, 6, 1, 4, 8]$<br/>
$y = [5, 2, 3, 12, 2, 4, 17, 11]$<br/>
a) by using element-wise mutliplication and np.sum,<br/>
b) by using np.dot,<br/>
b) by using np.matmul and transposition (x.T).

In [18]:
# Write your code here
x= np.array([3,1,4,2,6,1,4,8])
y= np.array([5,2,3,12,2,4,17,11])

print("a " +str(np.sum(x*y)))
print("b " + str(np.dot(x, y)))

print("c " + str(np.matmul(x, y)))



a 225
b 225
c 225


**Task 3.** Calculate value of the logistic model<br/>
$$y = \frac{1}{1 + e^{-x_0 \theta_0 - \ldots - x_9 \theta_9 - \theta_{10}}}$$
for<br/>
$x = [1.2, 2.3, 3.4, -0.7, 4.2, 2.7, -0.5, 1.4, -3.3, 0.2]$<br/>
$\theta = [2.7, 0.33, -2.12, -1.73, 2.9, -5.8, -0.9, 12.11, 3.43, -0.5, -1.65]$<br/>
and print the result. Use only vector operations.

In [37]:
# Write your code here
x = np.array([1.2,2.3,3.4,-0.7,4.2,2.7,-0.5,1.4,-3.3,0.2])
theta = np.array([2.7,0.33,-2.12,-1.73,2.9,-5.8,-0.9,12.11,3.43,-0.5,-1.65])

fun = lambda x,y: -x*y
power = np.sum(fun(x[:],theta[:len(theta)-1]))
y = 1/(1+np.exp(power - theta[len(theta)-1]))
print(y)

0.2417699832615572


**Task 4.** Calculate value of the multivariate linear regression model<br/>
$$y = A x + B$$
for<br/>
$A = \begin{bmatrix} 1 & 2 & 1 \\ 3 & 0 & 1 \end{bmatrix}$<br/>
$B = \begin{bmatrix} 0.2 \\ 0.3 \end{bmatrix}$<br/>
$x = [1, 2, 3]^T$<br/>
and print the result. Use only vector and matrix operations.

In [16]:
# Write your code here
A = np.array([1,2,1,3,0,1]).reshape(2,3)
B = np.array([0.2,0.3]).reshape(2,1)
print()
x = np.array([1,2,3]).reshape(3,1)
result = (np.matmul(A,x)+B)
print(result)


[[1 2 1]
 [3 0 1]]
[[0.2]
 [0.3]]

[[8.2]
 [6.3]]


# Pandas

## Load datasets

- Steam (https://www.kaggle.com/tamber/steam-video-games)

- MovieLens (https://grouplens.org/datasets/movielens/)

In [7]:
steam_df = pd.read_csv(os.path.join("data", "steam", "steam-200k.csv"), 
                       names=['user-id', 'game-title', 'behavior-name', 'value', 'zero'])

ml_ratings_df = pd.read_csv(os.path.join("data", "movielens_small", "ratings.csv"))
ml_movies_df = pd.read_csv(os.path.join("data", "movielens_small", "movies.csv"))

## Merge both MovieLens DataFrames into one

In [68]:
ml_df = pd.merge(ml_ratings_df, ml_movies_df, on='movieId')
ml_df.tail(10)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
100826,610,147662,3.0,1479544214,Return of the One-Armed Swordsman (1969),Action|Adventure
100827,610,148166,3.5,1479542224,Hitchcock/Truffaut (2015),Documentary
100828,610,149011,3.5,1493848843,He Never Died (2015),Comedy|Drama|Horror
100829,610,152372,3.5,1493848841,Southbound (2016),Horror
100830,610,158721,3.5,1479542491,Gen-X Cops (1999),Action|Comedy|Thriller
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller
100835,610,163981,3.5,1493850155,31 (2016),Horror


## Pandas tasks - Steam dataset

**Task 5.** How many people made a purchase in the Steam dataset? Remember that a person could buy many games, but you need to count every person once.

In [9]:
# Write your code here
steam_g = steam_df.loc[steam_df['behavior-name'] == 'purchase']
print(len(pd.unique(steam_g['user-id'])))

12393


**Task 6.** How many people made a purchase of "The Elder Scrolls V Skyrim"?

In [10]:
# Write your code here
steam_g = steam_df.loc[steam_df['game-title'] == 'The Elder Scrolls V Skyrim']
x = pd.unique(steam_g['user-id'])
print(len(x))

717


**Task 7.** How many purchases people made on average?

In [26]:
# Write your code here
users = len(pd.unique(steam_df['user-id']))
cond2 = steam_df['behavior-name'] == 'purchase'
g = steam_df[cond2]
group = g.groupby('user-id').sum()

print(group['value'].mean())



10.45033486645687


**Task 8.** Who bought the most games?

In [24]:
# Write your code here
cond2 = steam_df['behavior-name'] == 'purchase'
g = steam_df[cond2].groupby('user-id').sum()
g = g.sort_values(by='value', ascending=False).reset_index()
g = g.loc[:, ['user-id','value']]
x = g['user-id'].head(1)
print(int(x))

62990992


**Task 9.** How many hours on average people played in "The Elder Scrolls V Skyrim"?

In [45]:
# Write your code here
cond = steam_df['behavior-name'] == 'play'
cond2 = steam_df['game-title'] == 'The Elder Scrolls V Skyrim'

g = steam_df

g = g.loc[cond & cond2]

g = g.sort_values(by='value', ascending=False).reset_index()


display(g['value'].mean())

104.71093057607091

**Task 10.** Which games were played the most (in terms of the number of hours played)? Print the first 10 titles and respective numbers of hours.

In [52]:
# Write your code here
cond = steam_df['behavior-name'] == 'play'
g = steam_df[cond]
g = g.loc[:, ['game-title', 'value']]
g = g.groupby('game-title').sum()

g = g.sort_values(by='value', ascending=False).reset_index()


g.head(10)

Unnamed: 0,game-title,value
0,Dota 2,981684.6
1,Counter-Strike Global Offensive,322771.6
2,Team Fortress 2,173673.3
3,Counter-Strike,134261.1
4,Sid Meier's Civilization V,99821.3
5,Counter-Strike Source,96075.5
6,The Elder Scrolls V Skyrim,70889.3
7,Garry's Mod,49725.3
8,Call of Duty Modern Warfare 2 - Multiplayer,42009.9
9,Left 4 Dead 2,33596.7


**Task 11.** Which games are the most consistently played (in terms of the average number of hours played)? Print the first 10 titles and respective numbers of hours.

In [63]:
# Write your code here
cond = steam_df['behavior-name'] == 'play'
g = steam_df[cond]
g = g.loc[:, ['game-title', 'value']]
g = g.groupby('game-title')
g = g.mean()
g = g.sort_values(by='value', ascending=False).reset_index()


g.head(10)

Unnamed: 0,game-title,value
0,Eastside Hockey Manager,1295.0
1,Baldur's Gate II Enhanced Edition,475.255556
2,FIFA Manager 09,411.0
3,Perpetuum,400.975
4,Football Manager 2014,391.984615
5,Football Manager 2012,390.453165
6,Football Manager 2010,375.048571
7,Football Manager 2011,365.703226
8,Freaking Meatbags,331.0
9,Out of the Park Baseball 16,330.4


**Task 12\*\*.** Fix the above for the fact that 0 hours played is not listed, but only a purchase is recorded in such a case.

In [40]:
# Trzeba znalezc osoby i ich gry, ktore zostaly zakupione ale nie maja rekordu z
# behavior-name == play do poprawy na moodle


KeyError: "None of [Index([''g['user-id'].isin' is not recognized as an internal or external command,', 'operable program or batch file.'], dtype='object')] are in the [index]"

**Task 13.** Apply the sigmoid function
$$f(x) = \frac{1}{1 + e^{-\frac{1}{100}x}}$$
to hours played and print the first 10 rows from the entire Steam dataset after this change.

In [42]:
# Write your code here
cond = steam_df['behavior-name'] == 'play'

def sig(x):
    if x['behavior-name'] != 'play':
        return 1.0
    return 1/1 + np.exp(-x['value']/100)

g = steam_df.loc[cond].copy()
g['value'] = g.apply(lambda x: sig(x), axis=1)

g.head(10)


Unnamed: 0,user-id,game-title,behavior-name,value,zero
1,151603712,The Elder Scrolls V Skyrim,play,1.065219,0
3,151603712,Fallout 4,play,1.418952,0
5,151603712,Spore,play,1.861569,0
7,151603712,Fallout New Vegas,play,1.886034,0
9,151603712,Left 4 Dead 2,play,1.914846,0
11,151603712,HuniePop,play,1.918512,0
13,151603712,Path of Exile,play,1.922194,0
15,151603712,Poly Bridge,play,1.927743,0
17,151603712,Left 4 Dead,play,1.967539,0
19,151603712,Team Fortress 2,play,1.972388,0


## Pandas tasks - MovieLens dataset

**Task 14\*.** Calculate popularity (by the number of users who watched a movie) of all genres. Print a DataFrame with two columns: genre, n_users, where n_users contains the number of users who watched a given genre. Sort all genres in descending order.

In [46]:
# Write your code here
g = ml_df.loc[:].copy()
d = dict()

def f(x):
    stri = str(x).split('|')
    for y in stri:
        if y in d:
            d[y][0]  += 1
        else:
            d[y]  = [1]
            

g['genres'].apply(lambda x: f(x) )


g = g.groupby('genres').count().reset_index()
g = g.loc[:, ['genres', 'userId']]

df_d = {
    'genre': [],
    'n_users': []
}

for key in d:
    df_d['genre'].append(key)
    df_d['n_users'].append(d[key][0])

df = pd.DataFrame.from_dict(df_d)



df = df.sort_values(by='n_users', ascending=False).reset_index()

df = df.loc[:, ['genre', 'n_users']]
display(df)


Unnamed: 0,genre,n_users
0,Drama,41928
1,Comedy,39053
2,Action,30635
3,Thriller,26452
4,Adventure,24161
5,Romance,18124
6,Sci-Fi,17243
7,Crime,16681
8,Fantasy,11834
9,Children,9208


**Task 15\*.** Calculate average rating for all genres. Print a DataFrame with two columns: genre, rating, where rating contains the average rating for a given genre. Sort all genres in descending order.

In [54]:
# Write your code here
g = ml_df.loc[:].copy()
d = dict()

def f(x):
    stri = str(x['genres']).split('|')
    for y in stri:
        if y in d:
            d[y].append(x['rating'])
        else:
            d[y]  = [x['rating']]
            

g.apply(lambda x: f(x), axis=1 )


g = g.groupby('genres').count().reset_index()
g = g.loc[:, ['genres', 'userId']]

df_d = {
    'genre': [],
    'rating': []
}



for key in d:
    df_d['genre'].append(key)
    d[key][0] = np.array(d[key]).mean()
    df_d['rating'].append(d[key][0])

df = pd.DataFrame.from_dict(df_d)



df = df.sort_values(by='rating', ascending=False).reset_index()

df = df.loc[:, ['genre', 'rating']]
display(df)


Unnamed: 0,genre,rating
0,Film-Noir,3.920115
1,War,3.808294
2,Documentary,3.797785
3,Crime,3.658294
4,Drama,3.656184
5,Mystery,3.63246
6,Animation,3.629937
7,IMAX,3.618335
8,Western,3.583938
9,Musical,3.563678


**Task 17.** Calculate each movie rating bias (deviation from the mean of all movies average rating). Print first 10 in the form: title, average rating, bias.

In [65]:
# Write your code here
def deviation(x2, r2, n2):
    return np.sqrt(np.power(np.abs(x2 - r2),2)/n2)
g = ml_df.copy()

g = g.loc[:, ['title', 'rating']]

# g['average rating'] = g.groupby('title').sum()
d = g.groupby('title').mean()

r = g['rating'].mean()
n = len(g.index)


d = d.rename({'rating':'average_rating'}, axis=1)
d['bias'] = d.apply(lambda x:x.average_rating - r, axis=1)
d = d.reset_index()
d.head(10)


Unnamed: 0,title,average_rating,bias
0,'71 (2014),4.0,0.498443
1,'Hellboy': The Seeds of Creation (2004),4.0,0.498443
2,'Round Midnight (1986),3.5,-0.001557
3,'Salem's Lot (2004),5.0,1.498443
4,'Til There Was You (1997),4.0,0.498443
5,'Tis the Season for Love (2015),1.5,-2.001557
6,"'burbs, The (1989)",3.176471,-0.325086
7,'night Mother (1986),3.0,-0.501557
8,(500) Days of Summer (2009),3.666667,0.16511
9,*batteries not included (1987),3.285714,-0.215843


**Task 17.** Calculate each user rating bias (deviation from the mean of all users average rating). Print first 10 in the form: user_id, average rating, bias.

In [66]:
# Write your code here
# Write your code here
def deviation(x2, r2, n2):
    return np.sqrt(np.power(np.abs(x2 - r2),2)/n2)
g = ml_df.copy()

g = g.loc[:, ['userId', 'rating']]

# g['average rating'] = g.groupby('title').sum()
d = g.groupby('userId').mean()

r = g['rating'].mean()
n = len(g.index)


d = d.rename({'rating':'average rating', 'userId': 'user_id'}, axis=1)
d['bias'] = d.apply(lambda x: x-r)

d.head(10)

Unnamed: 0_level_0,average rating,bias
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.366379,0.864822
2,3.948276,0.446719
3,2.435897,-1.06566
4,3.555556,0.053999
5,3.636364,0.134807
6,3.493631,-0.007926
7,3.230263,-0.271294
8,3.574468,0.072911
9,3.26087,-0.240687
10,3.278571,-0.222986


**Task 18.** Randomly choose 10 movies and 10 users and print their interaction matrix in the form of a DataFrame with user_id as index and movie titles as columns. You can iterate over the DataFrame in this task.

In [97]:
# Write your code here do poprawy na moodle
g = ml_df.copy()


g1 = g.sample(n=10)


g1 = g1.loc[:]


display(g1)

d = {
    'user_id': [],
    'title': [],
    'values' : []
}

for i, row in g1.iterrows():
    d['user_id'].append(row['userId'])
    d['title'].append(row['title'])
    d['values'].append(1)


df = pd.DataFrame.from_dict(d)

df = pd.pivot_table(df, values='values', index = ['user_id'], columns='title', aggfunc=np.sum, fill_value='-')   

display(df)

/*
# Write your code here
g = ml_df.copy()


g1 = g.sample(n=10)


g1 = g1.loc[:]



d = {
    'user_id': [],
    'title': [],
    'values' : []
}

for i, row in g1.iterrows():
    d['user_id'].append(row['userId'])
    d['title'].append(row['title'])
    d['values'].append(1)


df = pd.DataFrame.from_dict(d)

df = pd.pivot_table(df, values='values', index = ['user_id'], columns='title', aggfunc=np.sum, fill_value='-')   


# print(df)

# używałem funkcji sample, i w sumie to możliwe, że nie za dobrze zrozumiałem polecenie

print(
    """     Father of the Bride (1950)  Matrix, The (1999)  Howards End (1992)  \\
11                          0.0                 0.0                 0.0
277                         0.0                 0.0                 0.0
286                         0.0                 1.0                 0.0
357                         0.0                 0.0                 0.0
414                         0.0                 1.0                 0.0
428                         0.0                 1.0                 0.0
470                         0.0                 0.0                 0.0
487                         0.0                 1.0                 0.0
543                         0.0                 1.0                 0.0
544                         0.0                 0.0                 0.0

     Thirteen Conversations About One Thing (a.k.a. 13 Conversations) (2001)  \\
11                                                 0.0
277                                                0.0
286                                                0.0
357                                                0.0
414                                                1.0
428                                                0.0
470                                                0.0
487                                                0.0
543                                                0.0
544                                                0.0

     It Runs in the Family (2003)  I Love You to Death (1990)  \\
11                            0.0                         0.0
277                           0.0                         0.0
286                           0.0                         0.0
357                           0.0                         0.0
414                           0.0                         0.0
428                           0.0                         0.0
470                           0.0                         0.0
487                           0.0                         0.0
543                           0.0                         0.0
544                           0.0                         0.0

     Bedazzled (1967)  I Don't Know How She Does It (2011)  \\
11                0.0                                  0.0
277               0.0                                  0.0
286               0.0                                  0.0
357               0.0                                  0.0
414               0.0                                  0.0
428               0.0                                  0.0
470               0.0                                  0.0
487               0.0                                  0.0
543               0.0                                  0.0
544               0.0                                  0.0

     Wind Rises, The (Kaze tachinu) (2013)  Night Guards (2016)
11                                     0.0                  0.0
277                                    0.0                  0.0
286                                    0.0                  0.0
357                                    0.0                  0.0
414                                    0.0                  0.0
428                                    0.0                  0.0
470                                    0.0                  0.0
487                                    0.0                  0.0
543                                    0.0                  0.0
544                                    0.0                  0.0"""
    )
*/

[596   1 293 493 572 325 149  75 547  56]
        userId  movieId  rating   timestamp      title  genres
100835     610   163981     3.5  1493850155  31 (2016)  Horror


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
13244,288,2640,3.5,1054569165,Superman (1978),Action|Adventure|Sci-Fi
33093,84,637,1.0,857653349,Sgt. Bilko (1996),Comedy
61599,480,2355,3.0,1179161045,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
22446,463,2019,4.0,1145460514,Seven Samurai (Shichinin no samurai) (1954),Action|Adventure|Drama
27836,606,19,2.0,1171814553,Ace Ventura: When Nature Calls (1995),Comedy
67450,21,56775,3.5,1427558186,National Treasure: Book of Secrets (2007),Action|Adventure
59984,590,1663,3.5,1258416870,Stripes (1981),Comedy|War
76357,42,1667,2.0,996221092,Mad City (1997),Action|Drama
23850,490,3481,4.0,1328145199,High Fidelity (2000),Comedy|Drama|Romance
32353,599,477,2.0,1519244415,What's Love Got to Do with It? (1993),Drama|Musical


title,Ace Ventura: When Nature Calls (1995),"Bug's Life, A (1998)",High Fidelity (2000),Mad City (1997),National Treasure: Book of Secrets (2007),Seven Samurai (Shichinin no samurai) (1954),Sgt. Bilko (1996),Stripes (1981),Superman (1978),What's Love Got to Do with It? (1993)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21,-,-,-,-,1.0,-,-,-,-,-
42,-,-,-,1.0,-,-,-,-,-,-
84,-,-,-,-,-,-,1.0,-,-,-
288,-,-,-,-,-,-,-,-,1.0,-
463,-,-,-,-,-,1.0,-,-,-,-
480,-,1.0,-,-,-,-,-,-,-,-
490,-,-,1.0,-,-,-,-,-,-,-
590,-,-,-,-,-,-,-,1.0,-,-
599,-,-,-,-,-,-,-,-,-,1.0
606,1.0,-,-,-,-,-,-,-,-,-


## Pandas + numpy tasks

**Task 19.** Create the entire interaction matrix for the MovieLens dataset. Print the submatrix of first 10 rows and 10 columns.

In [12]:
# Write your code here
g = ml_df.copy()

d = {
    'users': [],
    'titles' : []
}

def get_data(x):
    d['users'].append(x['userId'])
    d['titles'].append(x['title'])
    
g = g.sort_values(by='userId', ascending=True)
g.apply(lambda x: get_data(x), axis=1)

def check_it(x):
    #t = g.copy()
    #cond = t['userId'] == x['userId']
    pass
    
g = g.loc[:, ['userId', 'title']]
g = g.apply(lambda x: check_it(x), axis=1)

g.head(10)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



**Task 20.** Calculate the matrix of size (n_users, n_users) where at position (i, j) there is the number of movies watched both by user i and user j. Print the submatrix of first 10 rows and 10 columns.

In [None]:
# Write your code here

**Task 21.** Calculate the matrix of size (n_items, n_items) where at position (i, j) there is the number of users who watched both movie i and movie j. To prevent hanging your computer because of RAM shortage use only the first 1000 items. Print the submatrix of first 10 rows and 10 columns.

In [None]:
# Write your code here