### Q2 - Data Analyzing
This notebook performs:
- **Data Analyzing** on `cleaned_books_data.csv` (Part C)

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.linear_model import LinearRegression

In [2]:
# load the dataset
df = pd.read_csv('G:\\My Drive\\MSC\\project\\question2_social_media_analysis\\data_processing\\cleaned_books_data.csv')
df

Unnamed: 0,title,price,rating,category,availability,description
0,A Light in the Attic,51.77,3,Poetry,In stock (22 available),It's hard to imagine a world without A Light i...
1,Tipping the Velvet,53.74,1,Historical Fiction,In stock (20 available),"""Erotic and absorbing...Written with starling ..."
2,Soumission,50.10,1,Fiction,In stock (20 available),"Dans une France assez proche de la nôtre, un h..."
3,Sharp Objects,47.82,4,Mystery,In stock (20 available),"WICKED above her hipbone, GIRL across her hear..."
4,Sapiens: A Brief History of Humankind,54.23,5,History,In stock (20 available),From a renowned historian comes a groundbreaki...
...,...,...,...,...,...,...
993,Beyond Good and Evil,43.38,1,Philosophy,In stock (1 available),Friedrich Nietzsche's Beyond Good and Evil is ...
994,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",57.06,4,Sequential Art,In stock (1 available),High school student Kei Nagai is struck dead i...
995,A Spy's Devotion (The Regency Spies of London #1),16.97,5,Historical Fiction,In stock (1 available),"In England’s Regency era, manners and elegance..."
996,1st to Die (Women's Murder Club #1),53.98,1,Mystery,In stock (1 available),"James Patterson, bestselling author of the Ale..."


In [3]:
# Compute basic statistics
numeric_cols = ['price', 'rating']

basic_stats = df[numeric_cols].describe().T
basic_stats['median'] = df[numeric_cols].median()
basic_stats['mode'] = df[numeric_cols].mode().iloc[0]

print("Basic Statistics:")
basic_stats

Basic Statistics:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max,median,mode
price,998.0,35.052926,14.446311,10.0,22.1025,35.98,47.4075,59.99,35.98,16.28
rating,998.0,2.92485,1.435111,1.0,2.0,3.0,4.0,5.0,3.0,1.0


In [5]:
# Detect outliers in 'price' using IQR method
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = df[(df['price'] < lower_bound) | (df['price'] > upper_bound)]
print(f"Outliers in 'price':{len(outliers)} records found.")
print(outliers)

Outliers in 'price':0 records found.
Empty DataFrame
Columns: [title, price, rating, category, availability, description]
Index: []


In [6]:
# correlation matrix for numeric columns
corr, p_value = stats.pearsonr(df['price'], df['rating'])
print(f"Pearson correlation between 'price' and 'rating': {corr}, p-value: {p_value}")

Pearson correlation between 'price' and 'rating': 0.030137094444841823, p-value: 0.3415597130301688


In [7]:
# compare prices between top and bottom 10% rated books
top_10_percent = df['rating'].quantile(0.9)
bottom_10_percent = df['rating'].quantile(0.1)
top_rated = df[df['rating'] >= top_10_percent]['price']
bottom_rated = df[df['rating'] <= bottom_10_percent]['price']
t_stat, p_val = stats.ttest_ind(top_rated, bottom_rated, equal_var=False)
print(f"T-test between top and bottom 10% rated books' prices: t-statistic={t_stat}, p-value={p_val}")

T-test between top and bottom 10% rated books' prices: t-statistic=0.6384339581667415, p-value=0.5235563334739943


In [8]:
# simple linear regression: predicting price based on rating
x = df[['rating']]
y = df['price']
model = LinearRegression()
model.fit(x, y)
print(f"Linear Regression: price = {model.intercept_}")
print(f"Coefficient for rating: {model.coef_[0]}")
print(f"R^2: {model.score(x, y)}")

Linear Regression: price = 34.16561350772499
Coefficient for rating: 0.3033702361392453
R^2: 0.0009082444615772234
