In [24]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [25]:
# Read the laptop price dataset from disk into the notebook-wide frame `df`.
df = pd.read_csv(filepath_or_buffer='datasets/laptop_prices.csv')

# Preview the first five rows.
df.head(5)

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_in_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [26]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   Product           1303 non-null   object 
 3   TypeName          1303 non-null   object 
 4   Inches            1303 non-null   float64
 5   ScreenResolution  1303 non-null   object 
 6   Cpu               1303 non-null   object 
 7   Ram               1303 non-null   object 
 8   Memory            1303 non-null   object 
 9   Gpu               1303 non-null   object 
 10  OpSys             1303 non-null   object 
 11  Weight            1303 non-null   object 
 12  Price_in_euros    1303 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 132.5+ KB


In [27]:
# Histogram over the Company column: one bar per brand, bar height = number of rows.
fig = px.histogram(
    data_frame=df,
    x='Company',
    title='Amount of laptops per brand',
)

# Save a static PNG copy, then render the interactive figure in the notebook.
fig.write_image('images/amount_of_laptops_per_brand.png')
fig.show()

In [28]:
# Box plot of Price_in_euros with one box per Company.
fig = px.box(
    data_frame=df,
    x='Company',
    y='Price_in_euros',
    title='Laptop prices by brand',
)

# Save a static PNG copy, then render the interactive figure in the notebook.
fig.write_image("images/laptop_prices_by_brand.png")
fig.show()

In [29]:
# Box plot of Price_in_euros with one box per TypeName.
fig = px.box(
    data_frame=df,
    x='TypeName',
    y='Price_in_euros',
    title='Laptop prices by type',
)

# NOTE(review): the file name says "brand_and_gpu" but the chart is grouped by
# TypeName only; the path is kept as-is in case other files reference it.
fig.write_image("images/laptop_prices_by_brand_and_gpu.png")
fig.show()

In [30]:
# Box plot of Price_in_euros per TypeName, with a separate coloured box per Company.
fig = px.box(
    data_frame=df,
    x='TypeName',
    y='Price_in_euros',
    color='Company',
    title='Laptop prices by brand and type',
)

# Save a static PNG copy, then render the interactive figure in the notebook.
fig.write_image("images/laptop_prices_by_brand_and_type.png")
fig.show()

In [31]:
# Histogram of Price_in_euros over every row of the dataset.
fig = px.histogram(
    data_frame=df,
    x='Price_in_euros',
    title='Laptop prices distribution',
)

# Save a static PNG copy, then render the interactive figure in the notebook.
fig.write_image("images/laptop_prices_distribution.png")
fig.show()

In [32]:
from scipy.stats import shapiro

# Significance level used to decide whether to reject the normality hypothesis.
alpha = 0.05

# Shapiro-Wilk test on the price column; only the p-value is used below.
result = shapiro(df['Price_in_euros'])

# p < alpha -> report the prices as not normally distributed.
print(
    "Prices are not normally distributed"
    if result.pvalue < alpha
    else "Prices are normally distributed"
)



Prices are not normally distributed


In [33]:
# Keep only HP laptops. The subset was previously stored under the name
# `df_razer`, which did not match the 'HP' filter.
df_hp = df[df['Company'] == 'HP']

# Price histogram for the HP subset. The figure is bound to `fig` like every
# other plotting cell (it was previously bound to the typo `fix`), and the
# title names the brand so it is not confused with the all-brands histogram.
fig = px.histogram(df_hp, x='Price_in_euros', title='HP laptop prices distribution')

# Save a static PNG copy, then render the interactive figure in the notebook.
fig.write_image("images/laptop_prices_distribution_by_brand.png")
fig.show()


In [34]:
from scipy.stats import ttest_ind

# H0: mean price of Lenovo laptops = mean price of Dell laptops

# One subset per brand. The Lenovo subset was previously stored as `df_hp`,
# which mislabelled the data it held.
df_lenovo = df[df['Company'] == 'Lenovo']
df_dell = df[df['Company'] == 'Dell']

# Independent two-sample t-test on the two price samples.
# NOTE(review): ttest_ind is called with its default settings (equal-variance
# Student's test); consider equal_var=False (Welch) if the brand variances
# differ -- confirm before relying on the result.
result = ttest_ind(df_lenovo['Price_in_euros'], df_dell['Price_in_euros'])
print('p-value:', result.pvalue)

# Significance level for rejecting H0.
alpha = 0.05

# NOTE(review): p >= alpha means H0 could not be rejected, which is weaker than
# the "is equal" wording printed in the else branch.
if result.pvalue < alpha:
    print("Mean price of Lenovo laptops is not equal to mean price of Dell laptops")
else:
    print("Mean price of Lenovo laptops is equal to mean price of Dell laptops")


p-value: 0.07702514743218304
Mean price of Lenovo laptops is equal to mean price of Dell laptops


In [35]:
# Count rows per Company; after the aggregation the laptop_ID column holds the count.
df_need = (
    df.groupby(['Company'])
    .agg({'laptop_ID': 'count'})
    .reset_index()
)

# Pie chart with one slice per Company, sized by the row count.
fig = px.pie(
    data_frame=df_need,
    names='Company',
    values='laptop_ID',
    title='Company Distribution',
)

# Put the label and percentage inside each slice.
fig.update_traces(textinfo='label+percent', textposition='inside')

# Save a static PNG copy, then render the interactive figure in the notebook.
fig.write_image("images/laptop_by_brand_pie.png")
fig.show()

In [36]:
# Count rows per OpSys; after the aggregation the laptop_ID column holds the count.
df_need = (
    df.groupby(['OpSys'])
    .agg({'laptop_ID': 'count'})
    .reset_index()
)

# Pie chart with one slice per operating system, sized by the row count.
fig = px.pie(
    data_frame=df_need,
    names='OpSys',
    values='laptop_ID',
    title='Operating System Distribution',
)

# Put the label and percentage inside each slice.
fig.update_traces(textinfo='label+percent', textposition='inside')

# Save a static PNG copy, then render the interactive figure in the notebook.
fig.write_image("images/laptop_by_OpSys_pie.png")
fig.show()


In [37]:
# Rows whose operating system is exactly 'Android'.
is_android = df['OpSys'].eq('Android')
df_android = df[is_android]

# Preview the first five matching rows.
df_android.head(5)

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_in_euros
50,51,Lenovo,Yoga Book,2 in 1 Convertible,10.1,IPS Panel Touchscreen 1920x1200,Intel Atom x5-Z8550 1.44GHz,4GB,64GB Flash Storage,Intel HD Graphics 400,Android,0.69kg,319.0
1114,1129,Lenovo,Yoga Book,2 in 1 Convertible,10.1,IPS Panel Touchscreen 1920x1200,Intel Atom x5-Z8550 1.44GHz,4GB,64GB Flash Storage,Intel HD Graphics 400,Android,0.69kg,549.0


In [38]:
# Strip the 'kg' unit suffix from Weight and store the column as float.
# The conversion goes through astype(str) so the cell can be re-run safely:
# the previous `.str.replace(...)` directly on the column raised AttributeError
# on a second run, because Weight was already float and has no `.str` accessor.
# regex=False makes the literal-substring replacement explicit.
df['Weight'] = pd.to_numeric(
    df['Weight'].astype(str).str.replace('kg', '', regex=False)
).astype(float)

# Histogram of the numeric weights over every row of the dataset.
fig = px.histogram(df, x='Weight', title='Weight distribution of laptops')

# NOTE(review): the file name mentions "Android" but the chart covers all rows
# of df; the path is kept as-is in case other files reference it.
fig.write_image("images/weight_distribution_of_Android_laptops.png")
fig.show()

In [39]:
# Histogram of the Inches column over every row of the dataset.
fig = px.histogram(
    data_frame=df,
    x='Inches',
    title='Inches distribution of laptops',
)

# NOTE(review): the file name mentions "Android" but the chart covers all rows
# of df; the path is kept as-is in case other files reference it.
fig.write_image("images/inches_distribution_of_Android_laptops.png")
fig.show()

In [40]:
# Box plot of Inches with one box per Company.
fig = px.box(
    data_frame=df,
    x='Company',
    y='Inches',
    title='Screen size distribution of laptops by brand',
)

# NOTE(review): the file name says "weight ... Android" but the chart shows
# Inches for all rows of df; the path is kept as-is in case other files
# reference it.
fig.write_image("images/weight_distribution_of_Android_laptops_by_brand.png")
fig.show()

In [41]:
from scipy.stats import pearsonr

# Pearson correlation test between price and the Inches column (screen size).
result = pearsonr(df['Price_in_euros'], df['Inches'])
print('p-value:', result.pvalue)

# Significance level for rejecting "no linear correlation".
alpha = 0.05

# The messages name screen size: the variable tested is Inches, not
# ScreenResolution as the previous wording claimed.
# NOTE(review): a small p-value says the correlation is distinguishable from
# zero, not that it is strong; result.statistic holds the coefficient itself.
if result.pvalue < alpha:
    print("There is a correlation between price and screen size")
else:
    print("There is no correlation between price and screen size")


p-value: 0.013808550905012235
There is a correlation between price and screen resolution


In [42]:
# Count rows per (Product, Company) pair; after the aggregation the laptop_ID
# column holds the count.
df_need = df.groupby(['Product', 'Company']).agg({'laptop_ID': 'count'}).reset_index()

# Pie chart with one slice per Company, sized by the summed counts.
fig = px.pie(
    df_need,
    values='laptop_ID',
    names='Company',
    title='Product Distribution',
)
fig.update_traces(textposition='inside', textinfo='label+percent')

# Write to a dedicated file. This cell previously saved to
# "images/amount_of_product_variants.png", the same path the later top-10 chart
# writes to, so this image was always overwritten on a full run.
fig.write_image("images/amount_of_product_variants_all.png")
fig.show()

In [43]:
# Count rows per (Product, Company) pair, then keep the ten pairs with the
# highest counts.
df_need = (
    df.groupby(['Product', 'Company'])
    .agg({'laptop_ID': 'count'})
    .reset_index()
    .sort_values(by=['laptop_ID'], ascending=False)
    .head(10)
)

# Pie chart with one slice per Company, sized by the summed counts of the kept pairs.
fig = px.pie(
    data_frame=df_need,
    names='Company',
    values='laptop_ID',
    title='Product Distribution',
)

# Put the label and percentage inside each slice.
fig.update_traces(textinfo='label+percent', textposition='inside')

# Save a static PNG copy, then render the interactive figure in the notebook.
fig.write_image("images/amount_of_product_variants.png")
fig.show()

In [44]:
# Keep the first row for each distinct Product value.
# NOTE(review): de-duplication is on Product alone, so a product name shared by
# two companies would be counted only for the first one -- confirm this is intended.
df_product_unique = df.drop_duplicates(subset=['Product'])

# Histogram over Company: number of distinct products per brand.
fig = px.histogram(
    data_frame=df_product_unique,
    x='Company',
    title='Product distribution',
)

# Save a static PNG copy, then render the interactive figure in the notebook.
fig.write_image("images/product_distribution_unique.png")
fig.show()