In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

Unggah file

In [None]:
# Read the CSV at the path below into `df` and preview its first rows.
csv_path = '/content/steam_sales.csv'
df = pd.read_csv(csv_path)
df.head()

data cleaning

In [None]:
# Print column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Preview the lower-cased column names; the result is only displayed, `df` is not modified here.
df.columns.str.lower()

In [None]:
# Normalize the headers: lower-case every name and replace spaces with underscores.
new_col = df.columns.map(lambda name: name.lower().replace(' ', '_'))
df.columns = new_col
df.head()

cek duplikat

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
df.duplicated().sum()

Drop kolom yang kurang relevan

In [None]:
# Discard 'fetched_at'; the notebook treats it as not relevant to the analysis.
df = df.drop(columns='fetched_at')

In [None]:
# Discard 'game_name'; the notebook treats it as not relevant to the analysis.
df = df.drop(columns='game_name')

In [None]:
# Discard 'release_date'; the notebook treats it as not relevant to the analysis.
df = df.drop(columns='release_date')

In [None]:
# Preview the first rows after the column drops above.
df.head()

In [None]:
# Re-check dtypes and non-null counts after the column drops.
df.info()

In [None]:
# Convert the '#reviews' column to int64.
# Commas and surrounding whitespace are stripped before parsing
# (presumably the raw values are text with thousands separators, e.g. "1,234").
# astype(str) keeps this cell re-runnable: once the column is numeric, the
# .str accessor would otherwise raise AttributeError.
reviews_text = df['#reviews'].astype(str).str.replace(',', '', regex=False).str.strip()
reviews_num = pd.to_numeric(reviews_text, errors='coerce')

# errors='coerce' turns unparseable entries into NaN, and NaN cannot be cast
# to int64 (pandas raises on the cast). Drop those rows first, then cast.
parsed_ok = reviews_num.notna()
df = df.loc[parsed_ok].copy()
df['#reviews'] = reviews_num[parsed_ok].astype('int64')

In [None]:
# Check the dtype of '#reviews' after the numeric conversion.
df.info()

In [None]:
# Keep only rows where every column listed below has a value.
required_cols = ['rating', '#reviews', 'discount%', "price_(€)", "original_price_(€)"]
df = df.dropna(subset=required_cols)

In [None]:
# Confirm row count and non-null counts after removing rows with missing values.
df.info()

In [None]:
# Count the null entries remaining in each column and print the tally.
missing_values = df.isna().sum()
print('Missing values per column:')
print(missing_values)

In [None]:
# Recomputes the per-column null counts; the Series is only stored, not displayed.
missing_values = df.isnull().sum()

# **EDA**

Statistik deskriptif

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Korelasi antar fitur

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
corr = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Korelasi antar fitur')
plt.show()

pairplot

In [None]:
# Columns shown in the pairplot. This list includes the target 'price_(€)',
# so it gets its own name instead of `features`: the modelling section defines
# `features` without the target, and overwriting it here (e.g. by re-running
# this cell later) would leak the target into the model inputs.
pairplot_columns = ['rating', '#reviews', 'discount%', 'original_price_(€)', 'price_(€)', 'windows', 'linux', 'macos']
sns.pairplot(df[pairplot_columns].dropna())

Scatter plot harga terhadap jumlah review (sumbu-x berskala log)

In [None]:
# Review count (log-scaled x-axis) against price.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x='#reviews', y='price_(€)', alpha=0.6, color='darkorange', ax=ax)
ax.set_xscale('log')
ax.set_title('Scatter Plot: Jumlah Review vs Harga (€) [Log Scale]')
ax.set_xlabel('Jumlah Review (log scale)')
ax.set_ylabel('Harga (€)')
ax.grid(True)
plt.show()


scatter plot price dan rating

In [None]:
# Rating against price, one point per row of df.
plt.figure(figsize=(8, 5))
ax = sns.scatterplot(data=df, x='rating', y='price_(€)', alpha=0.6, color='teal')
ax.set(
    title='Scatter Plot: Rating vs Harga (€)',
    xlabel='Rating',
    ylabel='Harga (€)',
)
ax.grid(True)
plt.show()

bar chart

In [None]:
# Total of each platform column, keyed by display label.
# NOTE(review): assumes 'windows'/'linux'/'macos' are boolean or 0/1 flags, so the sum is a game count — confirm.
platform_columns = {'Windows': 'windows', 'Linux': 'linux', 'MacOS': 'macos'}
platform_counts = {label: df[col].sum() for label, col in platform_columns.items()}

# Bar chart of the per-platform totals.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(list(platform_counts), list(platform_counts.values()), color=['skyblue', 'lightgreen', 'salmon'])
ax.set_title('Jumlah Game per Platform')
ax.set_ylabel('Jumlah Game')
ax.set_xlabel('Platform')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Analisis **Prediktif**

In [None]:
# Model inputs (the target column is deliberately not in this list).
features = [
    'rating',
    '#reviews',
    'discount%',
    'original_price_(€)',
    'windows',
    'linux',
    'macos',
]
# Column the regression predicts.
target = 'price_(€)'

In [None]:
# Split the frame into the feature matrix and the target series.
X, y = df[features], df[target]

In [None]:
# Hold out 20% of the rows as a test set; random_state=42 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit an ordinary linear regression on the training split (fit returns the estimator itself),
# then predict prices for the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

Evaluasi Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

In [None]:
# Error metrics on the held-out split; RMSE is derived from MSE.
# NOTE(review): MAPE can become extremely large if y_test contains zero (or near-zero)
# prices — confirm that prices are strictly positive before relying on it.
mse, mae, mape = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, mean_absolute_percentage_error)
)
rmse = mse ** 0.5

# Report the results.
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"MAPE: {mape:.2%}")


In [None]:
import matplotlib.pyplot as plt

# Predicted vs. actual prices on the test split; the dashed diagonal marks perfect predictions.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_test, y_pred, alpha=0.6, color='mediumseagreen', edgecolors='k')
price_lo, price_hi = y_test.min(), y_test.max()
ax.plot([price_lo, price_hi], [price_lo, price_hi], '--', color='crimson', label='Garis Ideal')
ax.set_xlabel('Harga Asli (€)')
ax.set_ylabel('Harga Prediksi (€)')
ax.set_title('Prediksi vs Harga Asli')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()