# Python for Data Professional Beginner - Part 2

## Practice 1: Collection Manipulation with Python
Calculating average expenses and income of the last 9 months.

In [1]:
finance = {
'expenses': [2, 2.5, 2.25, 2.5, 3.2, 2.5, 3.5, 4, 3],
'income': [7.8, 7.5, 9, 7.6, 7.2, 7.5, 7, 10, 7.5]
}  # in million rupiahs

# Calculate the totals
total_expenses = 0
total_income = 0
for monthly_expenses in finance['expenses']: 
    total_expenses += monthly_expenses
for monthly_income in finance['income']: 
    total_income += monthly_income

# Calculate the averages
avg_expenses = total_expenses / len(finance['expenses'])
avg_income = total_income / len(finance['income'])
print(
    f'Average monthly expenses: Rp{avg_expenses:.2f} million'
    f'\nAverage monthly income  : Rp{avg_income:.2f} million'
)

Average monthly expenses: Rp2.83 million
Average monthly income  : Rp7.90 million


## Practice 2: String Manipulation with Python

Aksara was asked to study the popularity of *salak* (snake fruit) and *jeruk* (orange) based on articles that were published in the magazine *Buah Sehat* (Healthy Fruits). She also needs to determine how many articles whose titles contain positive connotation about each fruit.

In [2]:
articles = [
    'Buah Salak Baik untuk Mata',
    'Buah Salak Kaya Potasium',
    'Buah Jeruk Kaya Vitamin C',
    'Buah Salak Kaya Manfaat',
    'Salak Baik untuk Jantung',
    'Jeruk dapat Memperkuat Tulang',
    'Jeruk Mencegah Penyakit Asma',
    'Jeruk Memperkuat Gigi',
    'Jeruk Mencegah Kolesterol Jahat',
    'Salak Mencegah Diabetes',
    'Salak Memperkuat Dinding Usus',
    'Salak Baik untuk Darah',
    'Jeruk Kaya Manfaat untuk Jantung',
    'Salak si Kecil yang Baik',
    'Jeruk dan Salak Buah Kaya Manfaat',
    'Buah Jeruk Enak',
    'Tips Panen Jeruk Ribuan Kilo',
    'Tips Bertanam Salak',
    'Salak Manis untuk Berbuka',
    'Jeruk Baik untuk Wajah',
]
jeruk_articles = 0
salak_articles = 0
for title in articles:
    if title.count('Jeruk') > 0:
        jeruk_articles += 1
    if title.count('Salak') > 0:
        salak_articles += 1

positive_words = [
    'Kaya', 'Baik', 'Mencegah', 'Memperkuat',
]
jeruk_positive_articles = 0
salak_positive_articles = 0
for title in articles:
    for word in positive_words:
        if title.count('Jeruk') > 0 and title.count(word) > 0: 
            jeruk_positive_articles += 1
        if title.count('Salak') > 0 and title.count(word) > 0:
            salak_positive_articles += 1
print(
    f'Total articles               : {len(articles)}'
    f'\nJeruk articles count         : {jeruk_articles}'
    f'\nSalak articles count         : {jeruk_articles}'
    f'\nPositive articles about jeruk: {jeruk_positive_articles}'
    f'\nPositive articles about salak: {salak_positive_articles}'
)

Total articles               : 20
Jeruk articles count         : 10
Salak articles count         : 10
Positive articles about jeruk: 8
Positive articles about salak: 9


## Practice 3: Functions
Creating a function to calculate the average and standard deviation of data from a table.

|Lot area (m<sup>2</sup>)|House area (m<sup>2</sup>)|Distance to city (km)|Price (Rp, hundred millions)|
|-|-|-|-|
|70|50|15|500|
|70|60|30|400|
|70|60|55|300|
|100|50|30|700|
|100|70|25|1000|
|100|70|50|650|
|120|100|20|2000|
|120|80|50|1200|
|150|100|50|1800|
|150|90|15|3000|

In [3]:
# Translate the table into a dictionary
property_table = {
    'lot_area': [70, 70, 70, 100, 100, 100, 120, 120, 150, 150],
    'house_area': [50, 60, 60, 50, 70, 70, 100, 80, 100, 90],
    'city_distance': [15, 30, 55, 30, 25, 50, 20, 50, 50, 15],
    'price': [500, 400, 300, 700, 1000, 650, 2000, 1200, 1800, 3000]
}

def calculate_averages(data):
    total = 0
    for item in data:
        total += item
    average = total/len(data)
    return average

def calculate_stdev(data):
    data_average = calculate_averages(data)
    variance = 0
    for item in data:
        variance += (item - data_average) ** 2
        variance /= len(data)
    stdev = variance ** (1/2)
    return stdev

# Function to calculate average and standard deviation
# of each column in the property_table
def property_description(table):
    for key in table.keys():
        print(
            f'Average {key}: {calculate_averages(table[key]):.2f}'
            f'\nStandard deviation: {calculate_stdev(table[key]):.2f}\n'
        )

property_description(property_table)  # Call the last function

Average lot_area: 105.00
Standard deviation: 14.93

Average house_area: 73.00
Standard deviation: 6.03

Average city_distance: 34.00
Standard deviation: 6.24

Average price: 1155.00
Standard deviation: 587.06



## Mini Quiz: Predicting House Prices in Tangerang

In [4]:
with open("tangerang_house_price.txt", "r") as file:
    house_data = file.readlines()

house_price = []  # List of dictionaries
house_price_keys = house_data[0].replace("\n","").split(",")
for line in house_data[1:]:
    house_price_line = line.replace("\n","").split(",")
    house_price_dict = dict()
    for i in range(len(house_price_line)):
        house_price_dict[house_price_keys[i]] = house_price_line[i]
    house_price.append(house_price_dict)
print(house_price)

[{'lot_area': '70', 'house_area': '50', 'city_distance': '15', 'price': '500'}, {'lot_area': '70', 'house_area': '60', 'city_distance': '30', 'price': '400'}, {'lot_area': '70', 'house_area': '60', 'city_distance': '55', 'price': '300'}, {'lot_area': '100', 'house_area': '50', 'city_distance': '30', 'price': '700'}, {'lot_area': '100', 'house_area': '70', 'city_distance': '25', 'price': '1000'}, {'lot_area': '100', 'house_area': '70', 'city_distance': '50', 'price': '650'}, {'lot_area': '120', 'house_area': '100', 'city_distance': '20', 'price': '2000'}, {'lot_area': '120', 'house_area': '80', 'city_distance': '50', 'price': '1200'}, {'lot_area': '150', 'house_area': '100', 'city_distance': '50', 'price': '1800'}, {'lot_area': '150', 'house_area': '90', 'city_distance': '15', 'price': '3000'}]


In [5]:
def get_all_specified_attributes(list_of_dict, specified_key):
    list_attributes = []
    for data in list_of_dict:
        attribute = data[specified_key]
        list_attributes.append(attribute)
    return list_attributes

def min_value(list_attributes):
    min_attribute = 9999
    for attr in list_attributes:
        if int(attr) < min_attribute:
            min_attribute = int(attr)
    return min_attribute

def max_value(list_attributes):
    max_attribute = -9999
    for attr in list_attributes:
        if int(attr) > max_attribute:
            max_attribute = int(attr)
    return max_attribute

def transform_attribute(attr, max_attr, min_attr):
    transformed_value = (attr - min_attr) / (max_attr - min_attr)
    return transformed_value

def data_transformation(list_of_dict, list_attribute_names):
    attr_info = {}
    for attr_name in list_attribute_names:
        specified_attributes = get_all_specified_attributes(list_of_dict, attr_name)
        max_attr = max_value(specified_attributes)
        min_attr = min_value(specified_attributes)
        attr_info[attr_name] = {'max': max_attr, 'min': min_attr}
        data_idx = 0
        while(data_idx < len(list_of_dict)):
            list_of_dict[data_idx][attr_name] = transform_attribute(
                int(list_of_dict[data_idx][attr_name]), max_attr, min_attr)
            data_idx += 1
    return list_of_dict, attr_info

def transform_data(data, attr_info):
    for key_name in data.keys():
        data[key_name] = (
            data[key_name] - attr_info[key_name]['min']) / (
            attr_info[key_name]['max'] - attr_info[key_name]['min'])
    return data

def abs_value(value):
    if value < 0:
        return -value
    else:
        return value

def price_based_on_similarity(data, list_of_data):
    price_prediction = 0
    smallest_difference = 999
    for data_point in list_of_data:
        difference = abs_value(data['lot_area'] - data_point['lot_area'])
        difference += abs_value(data['house_area'] - data_point['house_area'])
        difference += abs_value(data['city_distance'] - data_point['city_distance'])
        if difference < smallest_difference:
            price_prediction = data_point['price']
            smallest_difference = difference
    return price_prediction

In [6]:
house_price, attr_info = data_transformation(
    house_price, ['lot_area', 'house_area', 'city_distance'])

data = {
    'lot_area': 110,
    'house_area': 80,
    'city_distance': 35
}
data = transform_data(data, attr_info)

pred_price = price_based_on_similarity(data, house_price)
print(f"House price prediction (Rp, hundred millions): {pred_price}")

House price prediction (Rp, hundred millions): 1200
