# Calculate Basic Statistics of the Datasets

In [44]:
import gzip
import json
import numpy as np
import pandas as pd
import sweetviz as sv
from ydata_profiling import ProfileReport
import os
# Render floats with thousands separators (e.g. 1,234.5) in every DataFrame display.
pd.set_option('display.float_format', '{:,}'.format)

**Specify data directory here:**

In [13]:
# Root directory of the project's data folder. Set the BASE_DATA_DIR environment
# variable to run this notebook on another machine; when it is unset (or empty) the
# original author's local path is used, so existing behaviour is unchanged.
BASE_DATA_DIR = (
    os.environ.get('BASE_DATA_DIR')
    or '/Users/spartan/Projects/Deep-Learning-Based-Group-Recommender-System/data'
)

**Load the data**

In [14]:
# All three tables are read from the 'interim' folder under BASE_DATA_DIR.
interim_dir = os.path.join(BASE_DATA_DIR, 'interim')

# Books
book_df = pd.read_csv(os.path.join(interim_dir, 'books_data.csv'))

# User-book interactions
interactions_df = pd.read_csv(os.path.join(interim_dir, 'interactions_data.csv'))

# Users
user_df = pd.read_csv(os.path.join(interim_dir, 'users_data.csv'))

## Missing Values Analysis and Data Type Analysis

In [25]:
# Number of missing values in each column of the books table.
book_missing_counts = book_df.isna().sum()
book_missing_counts

book_id           0
title             1
ratings_count     0
country_code      0
is_ebook          0
average_rating    0
dtype: int64

In [32]:
# Number of missing values in each column of the interactions table
# (axis=0 sums down the rows, giving one count per column).
interactions_missing_counts = interactions_df.isna().sum(axis=0)
interactions_missing_counts

user_id                        0
book_id                        0
review_id                      0
is_read                        0
rating                         0
review_text_incomplete    188374
date_added                     0
dtype: int64

In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) of the users table's numeric columns.
user_summary = user_df.describe()
user_summary

Unnamed: 0,Age
count,20627.0
mean,58.73917680709749
std,23.81057756047336
min,18.0
25%,38.0
50%,59.0
75%,80.0
max,99.0


## Statistical Analysis

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) of the books table's numeric columns.
book_summary = book_df.describe()
book_summary

Unnamed: 0,text_reviews_count,average_rating,book_id,ratings_count,work_id
count,36514.0,36514.0,36514.0,36514.0,36514.0
mean,14.690885687681437,4.063837979952895,10634515.447965164,279.68820178561646,13401612.114421863
std,110.59437369886793,0.399964810487884,10353449.4589185,7633.414462692745,17049736.749568902
min,0.0,0.0,234.0,0.0,166.0
25%,2.0,3.84,1185513.75,9.0,930767.0
50%,4.0,4.1,7223308.5,23.0,3284191.0
75%,9.0,4.31,18218723.75,69.0,21655329.0
max,10403.0,5.0,36485479.0,1029527.0,58229635.0


In [61]:
# Summary statistics (count, mean, std, min, quartiles, max) of the users table's numeric columns.
user_summary = user_df.describe()
user_summary

Unnamed: 0,Age
count,20627.0
mean,58.73917680709749
std,23.81057756047336
min,18.0
25%,38.0
50%,59.0
75%,80.0
max,99.0


In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) of the interactions table's numeric columns.
interactions_summary = interactions_df.describe()
interactions_summary

Unnamed: 0,book_id,rating
count,2734350.0,2734350.0
mean,6808743.628733703,1.8247868780514565
std,9698380.71917693,2.123223481801896
min,234.0,0.0
25%,42040.0,0.0
50%,592221.0,0.0
75%,12193298.0,4.0
max,36485479.0,5.0


# Refine the data based on the statistical analysis

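**The `review_text_incomplete` column was reported as roughly 94% missing, so it is unlikely to contribute significantly to predictive modeling and is dropped. (Note: the missing-value counts shown above — 188,374 of 2,734,350 rows — correspond to about 6.9% missing; verify the 94% figure before relying on it.)**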

In [40]:
# Remove the free-text review column from the interactions table.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps this cell
# re-runnable: the column may already be absent if the cell is executed twice, or if the
# interim CSV loaded above was previously saved without it.
interactions_df = interactions_df.drop(columns=['review_text_incomplete'], errors='ignore')

In [41]:
# Persist the trimmed interactions table to the interim folder under BASE_DATA_DIR,
# i.e. the same file it was loaded from, so the path follows the configured data directory.
# NOTE: this overwrites the interim input CSV; index=False avoids writing the row index
# as an extra column.
interactions_df.to_csv(
    os.path.join(BASE_DATA_DIR, 'interim', 'interactions_data.csv'),
    index=False,
)

## Generating a visualization report of the data

In [57]:
# Build a profiling report for the books table and export it as HTML.
# The 'reports' folder sits next to the data folder, so it is derived from BASE_DATA_DIR
# instead of a hard-coded absolute path; normpath guards against a trailing slash.
reports_dir = os.path.join(os.path.dirname(os.path.normpath(BASE_DATA_DIR)), 'reports')
os.makedirs(reports_dir, exist_ok=True)  # make sure the export target exists

# Titled like the interactions and users reports for consistency.
book_profile = ProfileReport(book_df, title="Pandas Profiling Report for Books")
book_profile.to_file(os.path.join(reports_dir, 'book_report.html'))

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [58]:
# Build a profiling report for the interactions table and export it as HTML.
# The 'reports' folder sits next to the data folder, so it is derived from BASE_DATA_DIR
# instead of a hard-coded absolute path; normpath guards against a trailing slash.
reports_dir = os.path.join(os.path.dirname(os.path.normpath(BASE_DATA_DIR)), 'reports')
os.makedirs(reports_dir, exist_ok=True)  # make sure the export target exists

interactions_profile = ProfileReport(interactions_df, title="Pandas Profiling Report for Interactions")
interactions_profile.to_file(os.path.join(reports_dir, 'interactions_report.html'))

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [63]:
# Build a profiling report for the users table and export it as HTML.
# NOTE(review): the variable and output file are named "reviews" although the data is
# user_df; the names are kept unchanged so anything that reads them keeps working.
# The 'reports' folder sits next to the data folder, so it is derived from BASE_DATA_DIR
# instead of a hard-coded absolute path; normpath guards against a trailing slash.
reports_dir = os.path.join(os.path.dirname(os.path.normpath(BASE_DATA_DIR)), 'reports')
os.makedirs(reports_dir, exist_ok=True)  # make sure the export target exists

reviews_profile = ProfileReport(user_df, title="Pandas Profiling Report for Users")
reviews_profile.to_file(os.path.join(reports_dir, 'reviews_report.html'))

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]