In [23]:
import pandas as pd

# Load the data from the "coffee.csv" file
data = pd.read_csv("coffee.csv")

# Replace missing values in each column according to the criteria
clean_data = data.copy()

# Replace missing values in the "Region" column with "Unknown"
clean_data['Region'].fillna('Unknown', inplace=True)

# Replace missing values in the "Place name" column with "Unknown"
clean_data['Place name'].fillna('Unknown', inplace=True)

# Replace missing values in the "Place type" column with "Unknown"
clean_data['Place type'].fillna('Unknown', inplace=True)

# Replace missing values in the "Rating" column with 0
clean_data['Rating'].fillna(0, inplace=True)

# Replace missing values in the "Reviews" column with the overall median number of reviews
median_reviews = clean_data['Reviews'].median()
clean_data['Reviews'].fillna(median_reviews, inplace=True)

# Replace missing values in the "Price" column with "Unknown"
clean_data['Price'].fillna('Unknown', inplace=True)

# Replace missing values in the "Delivery Option" column with False
clean_data['Delivery option'].fillna(False, inplace=True)

# Replace missing values in the "Dine in Option" column with False
clean_data['Dine in option'].fillna(False, inplace=True)

# Replace missing values in the "Takeaway Option" column with False
clean_data['Takeout option'].fillna(False, inplace=True)

# Display the first few rows of the cleaned dataframe
print(clean_data.head())


  Region             Place name  ... Dine in option  Takeout option
0      C               Dim Kavu  ...          False           False
1      C                Коферум  ...          False            True
2      C       Кофейня Світ Чаю  ...          False            True
3      C       Кофейня Starcoff  ...           True            True
4      C  Кофейня "Friend Zone"  ...           True            True

[5 rows x 9 columns]


In [24]:
import pandas as pd

# Load the original data from the "coffee.csv" file
data = pd.read_csv("coffee.csv")

# Group the data by rating and calculate the median, minimum, and maximum reviews for each rating group
reviews_by_rating = data.groupby("Rating")["Reviews"].agg([pd.Series.median, min, max]).reset_index()

# Rename the columns
reviews_by_rating.columns = ['rating', 'med_review', 'min_review', 'max_review']

# Round the values to 1 decimal place
reviews_by_rating = reviews_by_rating.round({'med_review': 1, 'min_review': 1, 'max_review': 1})

# Display the resulting dataframe
print(reviews_by_rating)



    rating  med_review  min_review  max_review
0      3.9         9.5         9.0        10.0
1      4.0       804.5       170.0      1439.0
2      4.1       452.5       189.0       716.0
3      4.2       497.0       385.0       609.0
4      4.3       221.5         3.0      1656.0
5      4.4       536.0        40.0      1201.0
6      4.5       688.0        27.0      2914.0
7      4.6       693.0        11.0      2931.0
8      4.7       400.0        80.0     17937.0
9      4.8       137.5        10.0      2873.0
10     4.9        40.0        10.0      1820.0
11     5.0        18.0        10.0       440.0


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

training_data = pd.read_csv('train.csv')
model1 = LinearRegression()
validation_data = pd.read_csv('validation.csv')
y_train = training_data['Reviews']
X_train = training_data.drop(columns = ['Reviews', 'Place.name'])
X_train = pd.get_dummies(X_train, columns = ['Region', 'Place.type', 'Price'])
X_train['Rating'] = X_train['Rating'].fillna(0)
X_train['Dine.in.option'] = X_train['Dine.in.option'].fillna(False)
X_train['Takeout.option'] = X_train['Takeout.option'].fillna(False)
y_train = y_train.fillna(y_train.median())
model1.fit(X_train, y_train)
validation_data = validation_data.drop(columns = 'Place.name')
to_predict = pd.get_dummies(validation_data, columns = ['Region', 'Place.type', 'Price'])
to_predict['Rating'] = to_predict['Rating'].fillna(0)
to_predict['Dine.in.option'] = to_predict['Dine.in.option'].fillna(False)
to_predict['Takeout.option'] = to_predict['Takeout.option'].fillna(False)
to_predict['Price_$'] = 0
X_train_column_order = X_train.columns.tolist()
to_predict = to_predict[X_train_column_order]
to_predict
predictions = model1.predict(to_predict)
base_result = pd.DataFrame()
base_result['rating'] = predictions
base_result.index = ['Unknown'] * len(base_result)
base_result


Unnamed: 0,rating
Unknown,-736.44388
Unknown,293.987082
Unknown,181.744835
Unknown,-238.221997
Unknown,1329.204082
Unknown,875.668808
Unknown,1111.595061
Unknown,475.970022
Unknown,504.030583
Unknown,1511.908864


In [26]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression

training_data = pd.read_csv('train.csv')
model1 = LogisticRegression()
validation_data = pd.read_csv('validation.csv')
y_train = training_data['Reviews']
X_train = training_data.drop(columns = ['Reviews', 'Place.name'])
X_train = pd.get_dummies(X_train, columns = ['Region', 'Place.type', 'Price'])
X_train['Rating'] = X_train['Rating'].fillna(0)
X_train['Dine.in.option'] = X_train['Dine.in.option'].fillna(False)
X_train['Takeout.option'] = X_train['Takeout.option'].fillna(False)
y_train = y_train.fillna(y_train.median())
model1.fit(X_train, y_train)
validation_data = validation_data.drop(columns = 'Place.name')
to_predict = pd.get_dummies(validation_data, columns = ['Region', 'Place.type', 'Price'])
to_predict['Rating'] = to_predict['Rating'].fillna(0)
to_predict['Dine.in.option'] = to_predict['Dine.in.option'].fillna(False)
to_predict['Takeout.option'] = to_predict['Takeout.option'].fillna(False)
to_predict['Price_$'] = 0
X_train_column_order = X_train.columns.tolist()
to_predict = to_predict[X_train_column_order]
to_predict
predictions = model1.predict(to_predict)
compare_result = pd.DataFrame()
compare_result['rating'] = predictions
compare_result.index = ['Unknown'] * len(base_result)
compare_result

Unnamed: 0,rating
Unknown,10.0
Unknown,18.0
Unknown,18.0
Unknown,10.0
Unknown,18.0
Unknown,12.0
Unknown,185.0
Unknown,18.0
Unknown,18.0
Unknown,18.0
