In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [6]:
# Load the dataset
data=pd.read_csv(r"C:\Users\chall\Downloads\cost_of_living.csv")  # replace with your dataset


In [8]:
data.head()

Unnamed: 0,Country,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index
0,Switzerland,101.1,46.5,74.9,109.1,97.0,158.7
1,Bahamas,85.0,36.7,61.8,81.6,83.3,54.6
2,Iceland,83.0,39.2,62.0,88.4,86.8,120.3
3,Singapore,76.7,67.2,72.1,74.6,50.4,111.1
4,Barbados,76.6,19.0,48.9,80.8,69.4,43.5


In [10]:
data.tail()

Unnamed: 0,Country,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index
116,Bangladesh,22.5,2.4,12.8,25.7,12.8,33.1
117,India,21.2,5.6,13.7,23.8,15.1,82.6
118,Egypt,21.0,3.7,12.7,21.2,16.2,20.0
119,Libya,20.4,4.3,12.7,22.2,15.2,42.0
120,Pakistan,18.8,2.8,11.1,17.5,12.9,29.1


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 7 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Country                         121 non-null    object 
 1   Cost of Living Index            121 non-null    float64
 2   Rent Index                      121 non-null    float64
 3   Cost of Living Plus Rent Index  121 non-null    float64
 4   Groceries Index                 121 non-null    float64
 5   Restaurant Price Index          121 non-null    float64
 6   Local Purchasing Power Index    121 non-null    float64
dtypes: float64(6), object(1)
memory usage: 6.7+ KB


In [30]:
data.shape

(121, 7)

In [16]:
data.isnull().sum()

Country                           0
Cost of Living Index              0
Rent Index                        0
Cost of Living Plus Rent Index    0
Groceries Index                   0
Restaurant Price Index            0
Local Purchasing Power Index      0
dtype: int64

In [18]:
data.isnull().any()

Country                           False
Cost of Living Index              False
Rent Index                        False
Cost of Living Plus Rent Index    False
Groceries Index                   False
Restaurant Price Index            False
Local Purchasing Power Index      False
dtype: bool

In [32]:
data.columns

Index(['Country', 'Cost of Living Index', 'Rent Index',
       'Cost of Living Plus Rent Index', 'Groceries Index',
       'Restaurant Price Index', 'Local Purchasing Power Index'],
      dtype='object')

In [28]:
data.describe()

Unnamed: 0,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index
count,121.0,121.0,121.0,121.0,121.0,121.0
mean,43.555372,16.052893,30.357851,44.228926,36.471074,65.094215
std,16.147574,11.412267,13.263721,17.055109,18.25811,39.569094
min,18.8,2.4,11.1,17.5,12.8,2.3
25%,30.2,8.5,19.8,31.6,21.6,34.8
50%,39.5,12.4,27.0,40.5,33.1,50.6
75%,52.8,20.1,37.0,53.7,47.2,99.4
max,101.1,67.2,74.9,109.1,97.0,182.5


In [14]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Cost of Living Index,121.0,43.555372,16.147574,18.8,30.2,39.5,52.8,101.1
Rent Index,121.0,16.052893,11.412267,2.4,8.5,12.4,20.1,67.2
Cost of Living Plus Rent Index,121.0,30.357851,13.263721,11.1,19.8,27.0,37.0,74.9
Groceries Index,121.0,44.228926,17.055109,17.5,31.6,40.5,53.7,109.1
Restaurant Price Index,121.0,36.471074,18.25811,12.8,21.6,33.1,47.2,97.0
Local Purchasing Power Index,121.0,65.094215,39.569094,2.3,34.8,50.6,99.4,182.5


In [20]:
data.columns

Index(['Country', 'Cost of Living Index', 'Rent Index',
       'Cost of Living Plus Rent Index', 'Groceries Index',
       'Restaurant Price Index', 'Local Purchasing Power Index'],
      dtype='object')

In [22]:
data.index

RangeIndex(start=0, stop=121, step=1)

In [24]:
data.index = list(data.index)

In [26]:
data.index

Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       111, 112, 113, 114, 115, 116, 117, 118, 119, 120],
      dtype='int64', length=121)

In [28]:
data["Country"].value_counts()

Country
Switzerland    1
Montenegro     1
Ghana          1
Russia         1
Philippines    1
              ..
Qatar          1
Estonia        1
Greece         1
Bahrain        1
Pakistan       1
Name: count, Length: 121, dtype: int64

In [30]:
d = (data[data['Cost of Living Index'] > 90])
d

Unnamed: 0,Country,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index
0,Switzerland,101.1,46.5,74.9,109.1,97.0,158.7


In [32]:
f = (data[data["Country"] == "India"])
f

Unnamed: 0,Country,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index
117,India,21.2,5.6,13.7,23.8,15.1,82.6


In [34]:
# to get top 5 country based on cost of living
sorted_df = data.sort_values(by='Cost of Living Index', ascending=False)

In [36]:
sorted_df.head()

Unnamed: 0,Country,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index
0,Switzerland,101.1,46.5,74.9,109.1,97.0,158.7
1,Bahamas,85.0,36.7,61.8,81.6,83.3,54.6
2,Iceland,83.0,39.2,62.0,88.4,86.8,120.3
3,Singapore,76.7,67.2,72.1,74.6,50.4,111.1
4,Barbados,76.6,19.0,48.9,80.8,69.4,43.5


In [38]:
sorted_df.tail()

Unnamed: 0,Country,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index
116,Bangladesh,22.5,2.4,12.8,25.7,12.8,33.1
117,India,21.2,5.6,13.7,23.8,15.1,82.6
118,Egypt,21.0,3.7,12.7,21.2,16.2,20.0
119,Libya,20.4,4.3,12.7,22.2,15.2,42.0
120,Pakistan,18.8,2.8,11.1,17.5,12.9,29.1


In [54]:
data.columns

Index(['Cost of Living Index', 'Rent Index', 'Cost of Living Plus Rent Index',
       'Groceries Index', 'Restaurant Price Index',
       'Local Purchasing Power Index', 'Country_Value'],
      dtype='object')

In [72]:
cat_cols =['Cost of Living Index', 'Rent Index', 'Cost of Living Plus Rent Index',
       'Groceries Index', 'Restaurant Price Index',
       'Local Purchasing Power Index', 'Country_Value'] 
i=0
while i < 8:
    fig = plt.figure(figsize=[14,7])
    #ax1 = fig.add_subplot(121)
    #ax2 = fig.add_subplot(122)
    
    #ax1.title.set_text(cat_cols[i])
    plt.subplot(1,2,1)
    sns..countplot(x=cat_cols[i], data=data)
    i += 1
    
    #ax2.title.set_text(cat_cols[i])
    plt.subplot(1,2,2)
    sns..countplot(x=cat_cols[i], data=data)
    i += 1
    
    plt.show()

SyntaxError: invalid syntax (4242507454.py, line 12)

In [40]:
from sklearn.preprocessing import LabelEncoder

In [42]:
# Initialize the label encoder
label_encoder = LabelEncoder()
# Fit and transform the data
data["Country_Value"] = label_encoder.fit_transform(data["Country"])

In [44]:
data.drop(columns= "Country",axis=1, inplace=True)

In [46]:
X = data.drop(columns= "Cost of Living Index")
print(X)

     Rent Index  Cost of Living Plus Rent Index  Groceries Index  \
0          46.5                            74.9            109.1   
1          36.7                            61.8             81.6   
2          39.2                            62.0             88.4   
3          67.2                            72.1             74.6   
4          19.0                            48.9             80.8   
..          ...                             ...              ...   
116         2.4                            12.8             25.7   
117         5.6                            13.7             23.8   
118         3.7                            12.7             21.2   
119         4.3                            12.7             22.2   
120         2.8                            11.1             17.5   

     Restaurant Price Index  Local Purchasing Power Index  Country_Value  
0                      97.0                         158.7            103  
1                      83.3      

In [48]:
y =data["Cost of Living Index"]
print(y)

0      101.1
1       85.0
2       83.0
3       76.7
4       76.6
       ...  
116     22.5
117     21.2
118     21.0
119     20.4
120     18.8
Name: Cost of Living Index, Length: 121, dtype: float64


In [50]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [52]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')

MSE: 0.005131148757251138
RMSE: 0.07163203722672655
R^2: 0.999983358291959
