---


**Fundamentals of Artificial Intelligence**

Laboratory Work No. 2: Exploratory Data Analysis with pandas & matplotlib

Author: Eugeniu Popa, student group FAF-202

Supervisor: Diana Marusic


---


In [None]:
# Task 1: Find the store that has the maximum sale recorded. Print the store id, date and the sales on that day.

import pandas as pd

# Read the training set from disk
train_data = pd.read_csv('train.csv')

# Keep only the row(s) whose Sales equals the overall maximum
peak_sales = train_data['Sales'].max()
max_sale_store = train_data.loc[train_data['Sales'] == peak_sales]

# Take the fields of the first matching row (any further ties are not reported)
store_id, date, sales = (
    max_sale_store[field].values[0] for field in ('Store', 'Date', 'Sales')
)

# Two left-aligned rows: column headers, then the values
header_row = f"{'Store ID':<10} {'Date':<12} {'Sales':<10}"
value_row = f"{store_id:<10} {date:<12} {sales:<10}"
output = "\n".join([header_row, value_row])

print(output)


In [None]:
# Task 2: Find the store(s) that have the smallest and the largest competition distance.

# Read the store metadata from disk
store_data = pd.read_csv('store.csv')

# Select the rows sitting at each extreme of the column; tied stores are all kept
distances = store_data['CompetitionDistance']
min_distance_store = store_data.loc[distances == distances.min()]
max_distance_store = store_data.loc[distances == distances.max()]

# Show the store id next to its distance for both extremes
report_columns = ['Store', 'CompetitionDistance']

print("Store(s) with the least competition distance:")
print(min_distance_store[report_columns])

print("\nStore(s) with the maximum competition distance:")
print(max_distance_store[report_columns])


In [None]:
# Task 3: Check if there are any missing values in the dataset and output the number of missing values per each column.

# Per-column count of null entries in each dataframe
missing_values_train = train_data.isnull().sum()
missing_values_store = store_data.isnull().sum()

# Both tables start with the same left-aligned header line
table_header = f"{'Column Name':<20} {'Missing Values':<15}"

# One "<column> <count>" line per column, appended below the header
train_rows = [
    f"{column:<20} {missing_count:<15}"
    for column, missing_count in missing_values_train.items()
]
output_train = "\n".join([table_header, *train_rows])

store_rows = [
    f"{column:<20} {missing_count:<15}"
    for column, missing_count in missing_values_store.items()
]
output_store = "\n".join([table_header, *store_rows])

print("Missing values in train_data:")
print(output_train)

print("\nMissing values in store_data:")
print(output_store)


In [None]:
# Task 4: Plot the monthly mean of sales across all stores using matplotlib.

import matplotlib.pyplot as plt

# Parse 'Date' into datetimes so the .dt accessor can be used on it
train_data['Date'] = pd.to_datetime(train_data['Date'])

# Store the calendar month and year of every record as their own columns
date_parts = train_data['Date'].dt
train_data['Month'] = date_parts.month
train_data['Year'] = date_parts.year

# Average Sales within every (Year, Month) pair
monthly_mean_sales = train_data.groupby(['Year', 'Month'])['Sales'].mean()

# Draw the averages as a connected line with a marker at every point
fig, ax = plt.subplots(figsize=(10, 6))
monthly_mean_sales.plot(ax=ax, marker='o', linestyle='-')
ax.set_xlabel('Month')
ax.set_ylabel('Mean Sales')
ax.set_title('Monthly Mean Sales Across All Stores')
ax.grid(True)
plt.show()


In [None]:
# Task 5: Which store type ('a', 'b', etc.) has had the most sales?

# Inner-join the sales records with the store metadata on the shared 'Store' key
merged_data = pd.merge(train_data, store_data, on='Store', how='inner')

# Sum Sales within each StoreType
store_type_sales = merged_data['Sales'].groupby(merged_data['StoreType']).sum()

# The index label holding the largest total is the answer
most_sales_store_type = store_type_sales.idxmax()
print("Store type with the most sales:", most_sales_store_type)


In [None]:
# Task 6: What is the difference in the mean of sales (across all stores) when offering a Promo and not? Plot this data with matplotlib.

# Mean Sales per Promo value. groupby sorts its keys, so position 0 is the
# smaller Promo value and position 1 the larger one.
# NOTE(review): assumes 'Promo' holds exactly two values, 0 (no promo) and
# 1 (promo) -- the tick labels below rely on that; confirm against the data.
promo_mean_sales = train_data.groupby('Promo')['Sales'].mean()

# diff() leaves NaN in the first slot and (second mean - first mean) in the
# last one; that last value is the number the task asks for.
promo_diffs = promo_mean_sales.diff()
promo_gain = promo_diffs.iloc[-1]
print(f"Difference in mean sales (Promo - No Promo): {promo_gain:.2f}")

fig, ax = plt.subplots(figsize=(10, 6))

# Bars for the two group means
ax.bar([0, 1], promo_mean_sales.values, color='blue', alpha=0.7, label='Mean Sales')

# The difference gets its own x position. Drawing it through a second
# Series.plot(kind='bar') call would place it at the same x as the 'Promo'
# bar and paint over it.
ax.bar([2], [promo_gain], color='green', alpha=0.7, label='Mean Sales Difference')

ax.set_xlabel('Promo')
ax.set_ylabel('Mean Sales')
ax.set_title('Mean Sales with and without Promo')
ax.set_xticks([0, 1, 2])
ax.set_xticklabels(['No Promo', 'Promo', 'Difference'], rotation=0)
ax.legend()
ax.grid(axis='y')
plt.show()


In [None]:
# Task 7: For the store with id 1, plot the mean sales per each day of week in a pie chart by using matplotlib.

# Rows belonging to store 1 only
store_1_data = train_data[train_data['Store'] == 1]

# Weekday number of every record (pandas: Monday = 0 ... Sunday = 6).
# pd.to_datetime leaves datetime values unchanged and parses text dates, so
# this cell works whether or not 'Date' has already been converted elsewhere.
weekday_numbers = pd.to_datetime(store_1_data['Date']).dt.dayofweek

# Mean Sales per weekday
mean_sales_per_day = store_1_data.groupby(weekday_numbers)['Sales'].mean()

# Derive the labels from the weekdays actually present in the result. A fixed
# list of seven labels makes plt.pie raise ValueError as soon as one weekday
# has no records for this store.
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
pie_labels = [day_names[day] for day in mean_sales_per_day.index]

# Create a pie chart
plt.figure(figsize=(6, 6))
plt.pie(mean_sales_per_day, labels=pie_labels, autopct='%1.1f%%')
plt.title('Mean Sales per Day of the Week for Store 1')
plt.show()


In [None]:
# Task 8: Plot the mean of sales across all the stores for each day of the week recorded in the dataset, by using matplotlib.

# Average Sales for each value of the 'DayOfWeek' column
day_of_week_mean_sales = train_data.groupby('DayOfWeek')['Sales'].mean()

# Bar chart drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 8))
day_of_week_mean_sales.plot(kind='bar', ax=ax, color='blue', alpha=0.7)

# Axis captions and chart title
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Mean Sales')
ax.set_title('Mean Sales for Each Day of the Week')

# Replace the seven tick labels with weekday names, kept horizontal
day_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
ax.set_xticks(range(7))
ax.set_xticklabels(day_labels, rotation=0)

# Write each bar's value just above its top edge
for position, mean_value in enumerate(day_of_week_mean_sales):
    ax.text(position, mean_value, f'{mean_value:.2f}', ha='center', va='bottom', fontsize=10)

ax.grid(axis='y')
plt.show()


In [None]:
# Task 9: For the first 10 stores (first 10 ids), draw boxplots of their sales by using matplotlib.

# Ids of the stores to compare
store_ids = list(range(1, 11))

# Filter data for the first 10 stores
first_10_stores_data = train_data[train_data['Store'].isin(store_ids)]

# One Sales series per store, in store-id order. Built once and reused for
# both the boxplot and the annotations instead of re-filtering per store.
sales_by_store = [
    first_10_stores_data.loc[first_10_stores_data['Store'] == store_id, 'Sales']
    for store_id in store_ids
]

# Boxes are drawn at x = 1..N. The tick labels are set with plt.xticks rather
# than boxplot's `labels=` keyword, which newer matplotlib releases deprecate.
box_positions = range(1, len(store_ids) + 1)
plt.figure(figsize=(20, 18))
boxplot = plt.boxplot(sales_by_store)
plt.xticks(box_positions, store_ids)
plt.xlabel('Store')
plt.ylabel('Sales')
plt.title('Boxplots of Sales for the First 10 Stores')
plt.grid(True)

# Annotate each box with the store's mean, minimum and maximum sales, placed
# slightly above the store's largest value
for position, store_sales in zip(box_positions, sales_by_store):
    plt.text(position, store_sales.max() + 200, f'Mean: {store_sales.mean():.2f}\nMin: {store_sales.min()}\nMax: {store_sales.max()}', ha='center', va='bottom')

plt.show()
