Mount Google Drive to access important SQL scripts

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


Change into project directory

In [2]:
%cd /content/drive/Shareddrives/big-data-drive/big-data-project/

/content/drive/Shareddrives/big-data-drive/big-data-project


Install mysql-server

In [3]:
! apt-get update
! apt-get install mysql-server

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Connecting to security.ubuntu.com (185.125.190.83)] [Connected to cloud.r-project.org (108.139.1                                                                                                    Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
0% [2 InRelease 12.7 kB/128 kB 10%] [Waiting for headers] [Waiting for headers] [Waiting for headers                                                                                                    Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [2 InRelease 15.6 kB/128 kB 12%] [Waiting for headers] [3 InRelease 3,632 B/3,632 B 100%] [Waitin0% [2 InRelease 15.6 kB/128 kB 12%] [Waiting for headers] [Waiting for headers] [Waiting for headers                                                                                                    Hit:4 https://developer.download.nvidia.com/compute

Use mysql.connector and pandas

In [4]:
import pandas as pd

!pip install mysql-connector-python
import mysql.connector

import tracemalloc
import time

Collecting mysql-connector-python
  Downloading mysql_connector_python-9.2.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.0 kB)
Downloading mysql_connector_python-9.2.0-cp311-cp311-manylinux_2_28_x86_64.whl (34.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.0/34.0 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mysql-connector-python
Successfully installed mysql-connector-python-9.2.0


Helper functions / global variables

In [5]:
db_name = "big_five"

def connect_to_mysql():
  # Create a connection to the MySQL server and use big five db
  conn = mysql.connector.connect(user="root", host="localhost", password="root")
  return conn

def create_cursor(conn):
  # Create a cursor to interact with the MySQL server
  cursor = conn.cursor()
  return cursor

# instantiate db
def connect_to_db(cursor, db_name):
  create_stmt = f"CREATE DATABASE IF NOT EXISTS {db_name}"
  use_stmt = f"USE {db_name}"
  cursor.execute(create_stmt)
  cursor.execute(use_stmt)

# store query results in dataframe for improved understanding / can pair with plotting frameworks
def store_query_results(cursor, query):
  cursor.execute(query)
  results = cursor.fetchall()
  columns = [desc[0] for desc in cursor.description]
  df = pd.DataFrame(results, columns=columns)
  return df

def store_query_results_with_placeholders(cursor, query, *args):
  cursor.execute(query, args)
  results = cursor.fetchall()
  columns = [desc[0] for desc in cursor.description]
  df = pd.DataFrame(results, columns=columns)
  return df

Start mysql server instance

In [6]:
!mysql --version
!service mysql start

mysql  Ver 8.0.41-0ubuntu0.22.04.1 for Linux on x86_64 ((Ubuntu))
 * Starting MySQL database server mysqld
   ...done.


If your output above contains "su: warning: cannot change directory to /nonexistent: No such file or directory", please run the code below and rerun the code above. Otherwise, please ignore.

In [7]:
!sudo service mysql stop
!sudo usermod -d /var/lib/mysql/ mysql
!sudo service mysql start
# force root to use mysql_native_password and sets passwd to root
# flush privileges to changes take place immediately
!mysql -e "ALTER USER 'root'@'localhost' IDENTIFIED WITH mysql_native_password BY 'root'; FLUSH PRIVILEGES;"

 * Stopping MySQL database server mysqld
   ...done.
 * Starting MySQL database server mysqld
   ...done.


Replicate database in notebook

In [8]:
import os
os.environ['MYSQL_PWD'] = "root"

tracemalloc.start()
start_time = time.time()

!mysql -u root -e "CREATE DATABASE big_five;"
!mysql -u root big_five < big_five_300k.sql

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

Current memory usage: 0.04 MB
Peak memory usage: 0.09 MB
Execution time: 242.17 seconds


In [9]:
# confirm replication of databases
# conn = connect_to_mysql()
# cursor = create_cursor(conn)
# connect_to_db(cursor, db_name)
# cursor.execute("SHOW TABLES")
# tables = cursor.fetchall()
# print(tables)

# cursor.execute("SELECT * FROM user")
# user_data = cursor.fetchall()
# print(user_data)

# cursor.close()
# conn.close()

Setup for querying database

In [10]:
# EXT
most_extroverted = ["EXT1", "EXT3", "EXT5", "EXT7", "EXT9"]
least_extroverted = ["EXT2", "EXT4", "EXT6", "EXT8", "EXT10"]

# AGR
least_agreeable = ["AGR1", "AGR3", "AGR5", "AGR7"]
most_agreeable = ["AGR2", "AGR4", "AGR6", "AGR8", "AGR10"]

# OPN
most_open = ["OPN1", "OPN3", "OPN5", "OPN7", "OPN8", "OPN9", "OPN10"]
least_open = ["OPN2", "OPN4", "OPN6"]

# CSN
most_conscientious = ["CSN1", "CSN3", "CSN5", "CSN7", "CSN9", "CSN10"]
least_conscientious = ["CSN2", "CSN4", "CSN6", "CSN8"]

# EST
# most neurotic is easiest agitated / disturbed
most_neurotic = ["EST1", "EST3", "EST5", "EST6", "EST7", "EST8", "EST9", "EST10"]
least_neurotic = ["EST2", "EST4"]

# Run before this cell for setup

## Queries

In [11]:
tracemalloc.start()
start_time = time.time()

conn = connect_to_mysql()
cursor = create_cursor(conn)
connect_to_db(cursor, db_name)

# most common response values to each question and the frequency of responses
most_common_responses = """
  SELECT question_id, response_value, count
  FROM (
    SELECT question_id, response_value, COUNT(*) AS count,
           RANK() OVER (PARTITION BY question_id ORDER BY COUNT(*) DESC) AS rnk
    FROM response
    GROUP BY question_id, response_value
  ) ranked
  WHERE rnk=1;
"""

common_response_df = store_query_results(cursor, most_common_responses)
print(common_response_df)

conn.commit()
cursor.close()
conn.close()

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

    question_id  response_value   count
0             1               3   85507
1             2               3   72256
2             3               4   84183
3             4               3   82819
4             5               4   88510
5             6               2  100156
6             7               1   69601
7             8               4   81664
8             9               4   71659
9            10               5   92584
10           11               4   80164
11           12               4   83282
12           13               4  106508
13           14               2   85606
14           15               2   78822
15           16               2   75366
16           17               4   72722
17           18               2   77253
18           19               4   83906
19           20               2   75272
20           21               1  117639
21           22               4  104754
22           23               1  113160
23           24               5  110454


In [12]:
tracemalloc.start()
start_time = time.time()

conn = connect_to_mysql()
cursor = create_cursor(conn)
connect_to_db(cursor, db_name)

# most common response values to each question based on screenw / screenh
# output (question_id, response_value, screenw, count)
# first join user and response table together
join_response_screenw = """
  CREATE OR REPLACE VIEW join_response_screenw AS
  SELECT r.question_id, r.response_value, u.screenw
  FROM response r
  JOIN user u
  ON r.user_id = u.user_id
"""
cursor.execute(join_response_screenw)
# explanation of partition
# partition is similar to group by in that it creates groupings of what is specified
# it is often used in tandem with functions like RANK(), ROW_NUMBER(), etc.
# in this case the partition is used to determined the granularity of ranking
# partitioning by question_id and screenw_range means we compute
# a rank for each question id, screenw_range pair, ex:
  # (id=1, w=0-700) -> rank 1
  # (id=1, w=1201-4000) -> rank 2
  # (id=1, w=701-1200) -> rank 3

# explanation of CASE
# used to create ranges of screenw values that we want to select over
most_common_response_by_screenw = """
  SELECT
    question_id,
    response_value,
    CASE
      WHEN screenw BETWEEN 0 AND 700 THEN '0-700'
      WHEN screenw BETWEEN 701 AND 1400 THEN '701-1200'
      WHEN screenw BETWEEN 1201 AND 4000 THEN '1201-4000'
      ELSE '4000+'
    END AS screenw_range,
    count
  FROM (
    SELECT
      question_id,
      response_value,
      screenw,
      COUNT(*) AS count,
      ROW_NUMBER() OVER (
        PARTITION BY
          question_id,
          CASE
            WHEN screenw BETWEEN 0 AND 700 THEN '0-700'
            WHEN screenw BETWEEN 701 AND 1400 THEN '701-1200'
            WHEN screenw BETWEEN 1201 AND 4000 THEN '1201-4000'
            ELSE '4000+'
          END
          ORDER BY COUNT(*) DESC) AS rnk
    FROM join_response_screenw
    GROUP BY
      question_id,
      response_value,
      screenw
  ) ranked
  WHERE rnk=1;
"""

common_response_screenw_df = store_query_results(cursor, most_common_response_by_screenw)
print(common_response_screenw_df)

conn.commit()
cursor.close()
conn.close()

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

     question_id  response_value screenw_range  count
0              1               1         0-700   8198
1              1               1     1201-4000   9675
2              1               2         4000+      3
3              1               3      701-1200  17381
4              2               3         0-700   6884
..           ...             ...           ...    ...
195           49               5      701-1200  24867
196           50               5         0-700  10600
197           50               5     1201-4000  13137
198           50               5         4000+      3
199           50               5      701-1200  20892

[200 rows x 4 columns]
Current memory usage: 0.11 MB
Peak memory usage: 0.14 MB
Execution time: 96.37 seconds


In [13]:
tracemalloc.start()
start_time = time.time()

# most common response values by country
conn = connect_to_mysql()
cursor = create_cursor(conn)
connect_to_db(cursor, db_name)

join_response_country = """
  CREATE OR REPLACE VIEW join_response_country AS
  SELECT r.question_id, r.response_value, u.country
  FROM response r
  JOIN user u
  ON r.user_id = u.user_id
"""
cursor.execute(join_response_country)

# row number assigns a unique rank to each row within a partition even if ties
most_common_response_by_country = """
  SELECT question_id, response_value, country, count
  FROM (
    SELECT question_id, response_value, country, COUNT(*) AS count,
           ROW_NUMBER() OVER (PARTITION BY question_id, country ORDER BY COUNT(*) DESC) AS rnk
    FROM join_response_country
    GROUP BY question_id, response_value, country
  ) ranked
  WHERE rnk=1
  ORDER BY count;
"""

common_response_country_df = store_query_results(cursor, most_common_response_by_country)
print(common_response_country_df)

conn.commit()
cursor.close()
conn.close()

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

       question_id  response_value country  count
0               28               3      FM      1
1               28               2      GA      1
2               28               4      GF      1
3               28               3      KM      1
4               28               4      LC      1
...            ...             ...     ...    ...
10245           44               1      US  67081
10246           43               5      US  68705
10247           21               1      US  74416
10248           49               5      US  75104
10249           46               1      US  76366

[10250 rows x 4 columns]
Current memory usage: 1.05 MB
Peak memory usage: 3.01 MB
Execution time: 128.03 seconds


In [14]:
tracemalloc.start()
start_time = time.time()

# Average duration to answer each question
conn = connect_to_mysql()
cursor = create_cursor(conn)
connect_to_db(cursor, db_name)

# Average duration to answer each question
average_duration = """
  SELECT question_id, AVG(response_time) as duration
  FROM response
  GROUP BY question_id
"""

avg_duration_df = store_query_results(cursor, average_duration)
print(avg_duration_df)

conn.commit()
cursor.close()
conn.close()

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

    question_id    duration
0             1  86818.8677
1             2   8755.0227
2             3  10727.8612
3             4   7679.3029
4             5   7083.6661
5             6   6710.8217
6             7   8949.2075
7             8   6679.1732
8             9   5803.4873
9            10   5940.4367
10           11   8970.9609
11           12   8113.3056
12           13   7536.4100
13           14   8630.2397
14           15   8495.3615
15           16   8635.5481
16           17   6681.4870
17           18   5410.4982
18           19   5848.9069
19           20   4341.7137
20           21  16741.6144
21           22   8395.4880
22           23   6893.2311
23           24  10140.9138
24           25   8514.5853
25           26   6565.0148
26           27   9506.7836
27           28   7062.8020
28           29   5470.7558
29           30   6151.0363
30           31  12559.5818
31           32   9520.8978
32           33   9593.8340
33           34   8000.8493
34           35   78

In [15]:
# max duration to answer each question
conn = connect_to_mysql()
cursor = create_cursor(conn)
connect_to_db(cursor, db_name)

# max duration to answer each question
max = """
  SELECT question_id, MAX(response_time)
  FROM response
  GROUP BY question_id
"""

max_duration_df = store_query_results(cursor, max)
print(max_duration_df)

conn.commit()
cursor.close()
conn.close()

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

    question_id  MAX(response_time)
0             1          1205348096
1             2           196659639
2             3           338495246
3             4            98150966
4             5           270215604
5             6           246406040
6             7            90544381
7             8            78912186
8             9            35249416
9            10           101168717
10           11           335110539
11           12            88979467
12           13           169998263
13           14            84457372
14           15           194734369
15           16           321131581
16           17            89522233
17           18            31299296
18           19           313106990
19           20            15810889
20           21           433601409
21           22           172690760
22           23            72625828
23           24           329878192
24           25            96295034
25           26           166636468
26           27           49

In [16]:
tracemalloc.start()
start_time = time.time()

# min duration to answer each question
conn = connect_to_mysql()
cursor = create_cursor(conn)
connect_to_db(cursor, db_name)

# min duration to answer each question
min = """
  SELECT question_id, MIN(response_time)
  FROM response
  GROUP BY question_id
"""

min_duration_df = store_query_results(cursor, min)
# print(min_duration_df)

conn.commit()
cursor.close()
conn.close()

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

Current memory usage: 0.04 MB
Peak memory usage: 0.10 MB
Execution time: 69.24 seconds


Helper functions for separating questions by id

In [17]:
def get_question_ids_by_trait_category(curso, trait_categories):
  # f prior to triple quotes enables f-string formatting
  # ', '.join(...) joins placeholders with commas
  # ['%s'] * len(trait_categories) dynamically creates list of placeholders
  # values from trait_categories will substitute in for '%s'
  select_specific_questions = f"""
    SELECT question_id
    FROM question
    WHERE trait_category IN ({', '.join(['%s'] * len(trait_categories))})
  """

  # cursor expects arguments in tuple form
  cursor.execute(select_specific_questions, tuple(trait_categories))
  # returns tuples, id is first value of tuple
  question_ids = [question[0] for question in cursor.fetchall()]
  return question_ids

# view joins response and user table and filters resulting table based on a set of question ids
def create_view_of_selected_questions(cursor, view_name, question_ids):
    select_question_view = f"""
      CREATE OR REPLACE VIEW {view_name} AS
      SELECT r.question_id, r.response_value, u.country
      FROM response r
      JOIN user u ON r.user_id = u.user_id
      WHERE r.question_id IN ({', '.join(['%s'] * len(question_ids))})
    """

    cursor.execute(select_question_view, tuple(question_ids))

# counts the total number of respondents to each question
# in a particular country
# uses view created by function above
def count_per_country_df_from_view(cursor, view_name):
  count_per_country = f"""
    SELECT country, COUNT(*) as total_respondents
    FROM {view_name}
    GROUP BY country
  """

  country_count_df = store_query_results(cursor, count_per_country)
  return country_count_df

# creates a dataframe containing question, country, and
# count of responses between 4 and 5 for each question, country pair
def agreement_by_country(cursor, view_name):
  agreement_by_country = f"""
    SELECT
      question_id,
      country,
      COUNT(*) as count
    FROM
      {view_name}
    WHERE response_value BETWEEN 4 AND 5
    GROUP BY question_id, country;
  """

  agreement_by_country_df = store_query_results(cursor, agreement_by_country)
  return agreement_by_country_df

def merge_country_df_to_find_percentage(agreement_df, country_df):
  merged_country_df = agreement_df.merge(country_df, on='country')
  merged_country_df['percentage'] = merged_country_df['count'] / merged_country_df['total_respondents'] * 100

  return merged_country_df

## Queries analyzing how different countries respond to trait related questions

In [18]:
try:
  tracemalloc.start()
  start_time = time.time()

  conn = connect_to_mysql()
  cursor = create_cursor(conn)
  connect_to_db(cursor, db_name)

  # setup
  most_extroverted_ids = get_question_ids_by_trait_category(cursor, most_extroverted)
  most_extroverted_view = "most_extroverted_view"
  create_view_of_selected_questions(cursor, most_extroverted_view, most_extroverted_ids)
  country_count_df = count_per_country_df_from_view(cursor, most_extroverted_view)

  # find percentage of responses between 4/5 for each question id, country pair
  high_EXT_by_country_df = agreement_by_country(cursor, most_extroverted_view)
  print(high_EXT_by_country_df)

  high_EXT_merged_country_df = merge_country_df_to_find_percentage(high_EXT_by_country_df, country_count_df)
  print(high_EXT_merged_country_df)
  print(high_EXT_merged_country_df.loc[[high_EXT_merged_country_df["percentage"].idxmax()]])

  # Get memory usage details
  current, peak = tracemalloc.get_traced_memory()
  end_time = time.time()  # Stop execution timer

  # Print memory stats
  print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
  print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
  print(f"Execution time: {end_time - start_time:.2f} seconds")

  tracemalloc.stop()
except Exception as e:
  print("Error during database operations: ", e)
finally:
  if cursor:
    cursor.close()
  if conn:
    conn.commit()
    conn.close()

     question_id country  count
0              1      AD      1
1              1      AE    283
2              1      AF      6
3              1      AG      3
4              1      AL     38
..           ...     ...    ...
898            9      XK      1
899            9      YE      1
900            9      ZA    375
901            9      ZM     13
902            9      ZW      4

[903 rows x 3 columns]
     question_id country  count  total_respondents  percentage
0              1      AD      1                 30    3.333333
1              1      AE    283               4585    6.172301
2              1      AF      6                 90    6.666667
3              1      AG      3                 30   10.000000
4              1      AL     38                615    6.178862
..           ...     ...    ...                ...         ...
898            9      XK      1                 20    5.000000
899            9      YE      1                 10   10.000000
900            9      ZA 

In [19]:
tracemalloc.start()
start_time = time.time()

# repeats process used in finding responses to most extroverted questions by country
conn = connect_to_mysql()
cursor = create_cursor(conn)
connect_to_db(cursor, db_name)

least_extroverted_ids = get_question_ids_by_trait_category(cursor, least_extroverted)
least_extroverted_view = "least_extroverted_view"
create_view_of_selected_questions(cursor, least_extroverted_view, least_extroverted_ids)
country_count_df = count_per_country_df_from_view(cursor, least_extroverted_view)

low_EXT_by_country_df = agreement_by_country(cursor, least_extroverted_view)
print(low_EXT_by_country_df)

low_EXT_merged_country_df = merge_country_df_to_find_percentage(low_EXT_by_country_df, country_count_df)
print(low_EXT_merged_country_df)
print(low_EXT_merged_country_df.loc[[low_EXT_merged_country_df["percentage"].idxmax()]])

conn.commit()
cursor.close()
conn.close()

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

     question_id country  count
0              2      AD      3
1              2      AE    287
2              2      AF      6
3              2      AG      3
4              2      AI      1
..           ...     ...    ...
924           10      XK      3
925           10      YE      2
926           10      ZA    587
927           10      ZM     14
928           10      ZW     21

[929 rows x 3 columns]
     question_id country  count  total_respondents  percentage
0              2      AD      3                 30   10.000000
1              2      AE    287               4585    6.259542
2              2      AF      6                 90    6.666667
3              2      AG      3                 30   10.000000
4              2      AI      1                  5   20.000000
..           ...     ...    ...                ...         ...
924           10      XK      3                 20   15.000000
925           10      YE      2                 10   20.000000
926           10      ZA 

In [None]:
tracemalloc.start()
start_time = time.time()

conn = connect_to_mysql()
cursor = create_cursor(conn)
connect_to_db(cursor, db_name)

# find responses between 4 and 5 to questions relating to high agreableness for each country
most_agreeable_ids = get_question_ids_by_trait_category(cursor, most_agreeable)
most_agreeable_view = "most_agreeable_view"
create_view_of_selected_questions(cursor, most_agreeable_view, most_agreeable_ids)
country_count_df = count_per_country_df_from_view(cursor, most_agreeable_view)

high_AGR_by_country_df = agreement_by_country(cursor, most_agreeable_view)
print(high_AGR_by_country_df)

high_AGR_merged_country_df = merge_country_df_to_find_percentage(high_AGR_by_country_df, country_count_df)
print(high_AGR_merged_country_df)
print(high_AGR_merged_country_df.loc[[high_AGR_merged_country_df["percentage"].idxmax()]])

conn.commit()
cursor.close()
conn.close()

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

     question_id country  count
0             22      AD      4
1             22      AE    576
2             22      AF     10
3             22      AG      3
4             22      AI      1
..           ...     ...    ...
965           30      XK      3
966           30      YE      2
967           30      ZA    653
968           30      ZM     20
969           30      ZW     18

[970 rows x 3 columns]
     question_id country  count  total_respondents  percentage
0             22      AD      4                 30   13.333333
1             22      AE    576               4585   12.562704
2             22      AF     10                 90   11.111111
3             22      AG      3                 30   10.000000
4             22      AI      1                  5   20.000000
..           ...     ...    ...                ...         ...
965           30      XK      3                 20   15.000000
966           30      YE      2                 10   20.000000
967           30      ZA 

In [None]:
tracemalloc.start()
start_time = time.time()

conn = connect_to_mysql()
cursor = create_cursor(conn)
connect_to_db(cursor, db_name)

# find responses between 4 and 5 to questions relating to least agreableness for each country
least_agreeable_ids = get_question_ids_by_trait_category(cursor, least_agreeable)
least_agreeable_view = "least_agreeable_view"
create_view_of_selected_questions(cursor, least_agreeable_view, least_agreeable_ids)
country_count_df = count_per_country_df_from_view(cursor, least_agreeable_view)

low_AGR_by_country_df = agreement_by_country(cursor, least_agreeable_view)
print(low_AGR_by_country_df)

low_AGR_merged_country_df = merge_country_df_to_find_percentage(low_AGR_by_country_df, country_count_df)
print(low_AGR_merged_country_df)
print(low_AGR_merged_country_df.loc[[low_AGR_merged_country_df["percentage"].idxmax()]])

conn.commit()
cursor.close()
conn.close()

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

     question_id country  count
0             21      AD      1
1             21      AE    289
2             21      AF      9
3             21      AG      1
4             21      AL     45
..           ...     ...    ...
688           27      WS      2
689           27      XK      1
690           27      ZA    183
691           27      ZM      3
692           27      ZW      2

[693 rows x 3 columns]
     question_id country  count  total_respondents  percentage
0             21      AD      1                 24    4.166667
1             21      AE    289               3668    7.878953
2             21      AF      9                 72   12.500000
3             21      AG      1                 24    4.166667
4             21      AL     45                492    9.146341
..           ...     ...    ...                ...         ...
688           27      WS      2                  8   25.000000
689           27      XK      1                 16    6.250000
690           27      ZA 

In [None]:
tracemalloc.start()
start_time = time.time()

conn = connect_to_mysql()
cursor = create_cursor(conn)
connect_to_db(cursor, db_name)

# find responses between 4 and 5 to questions relating to high openness for each country
most_open_ids = get_question_ids_by_trait_category(cursor, most_open)
most_open_view = "most_open_view"
create_view_of_selected_questions(cursor, most_open_view, most_open_ids)
country_count_df = count_per_country_df_from_view(cursor, most_open_view)

high_OPN_by_country_df = agreement_by_country(cursor, most_open_view)
print(high_OPN_by_country_df)

high_OPN_merged_country_df = merge_country_df_to_find_percentage(high_OPN_by_country_df, country_count_df)
print(high_OPN_merged_country_df)
print(high_OPN_merged_country_df.loc[[high_OPN_merged_country_df["percentage"].idxmax()]])

conn.commit()
cursor.close()
conn.close()

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

      question_id country  count
0              41      AD      3
1              41      AE    433
2              41      AF     12
3              41      AG      5
4              41      AI      1
...           ...     ...    ...
1346           50      XK      2
1347           50      YE      2
1348           50      ZA    788
1349           50      ZM     21
1350           50      ZW     24

[1351 rows x 3 columns]
      question_id country  count  total_respondents  percentage
0              41      AD      3                 42    7.142857
1              41      AE    433               6419    6.745599
2              41      AF     12                126    9.523810
3              41      AG      5                 42   11.904762
4              41      AI      1                  7   14.285714
...           ...     ...    ...                ...         ...
1346           50      XK      2                 28    7.142857
1347           50      YE      2                 14   14.285714
134

In [None]:
tracemalloc.start()
start_time = time.time()

conn = connect_to_mysql()
cursor = create_cursor(conn)
connect_to_db(cursor, db_name)

# find responses between 4 and 5 to questions relating to low openness for each country
least_open_ids = get_question_ids_by_trait_category(cursor, least_open)
least_open_view = "least_open_view"
create_view_of_selected_questions(cursor, least_open_view, least_open_ids)
country_count_df = count_per_country_df_from_view(cursor, least_open_view)

low_OPN_by_country_df = agreement_by_country(cursor, least_open_view)
print(low_OPN_by_country_df)

low_OPN_merged_country_df = merge_country_df_to_find_percentage(low_OPN_by_country_df, country_count_df)
print(low_OPN_merged_country_df)
print(low_OPN_merged_country_df.loc[[low_OPN_merged_country_df["percentage"].idxmax()]])

conn.commit()
cursor.close()
conn.close()

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

     question_id country  count
0             42      DE    307
1             42      AU   1887
2             42      US  19205
3             42      PH   1093
4             42      TW     29
..           ...     ...    ...
465           46      ME      1
466           46      LC      1
467           46      AF      1
468           46      SZ      1
469           46      IQ      1

[470 rows x 3 columns]
     question_id country  count  total_respondents  percentage
0             42      DE    307              12456    2.464676
1             42      AU   1887              45093    4.184685
2             42      US  19205             491466    3.907697
3             42      PH   1093              17733    6.163650
4             42      TW     29                735    3.945578
..           ...     ...    ...                ...         ...
465           46      ME      1                120    0.833333
466           46      LC      1                  6   16.666667
467           46      AF 

In [None]:
tracemalloc.start()
start_time = time.time()

conn = connect_to_mysql()
cursor = create_cursor(conn)
connect_to_db(cursor, db_name)

# find responses between 4 and 5 to questions relating to high conscientiousness for each country
most_conscientious_ids = get_question_ids_by_trait_category(cursor, most_conscientious)
most_conscientious_view = "most_conscientious_view"
create_view_of_selected_questions(cursor, most_conscientious_view, most_conscientious_ids)
country_count_df = count_per_country_df_from_view(cursor, most_conscientious_view)

high_CSN_by_country_df = agreement_by_country(cursor, most_conscientious_view)
print(high_CSN_by_country_df)

high_CSN_merged_country_df = merge_country_df_to_find_percentage(high_CSN_by_country_df, country_count_df)
print(high_CSN_merged_country_df)
print(high_CSN_merged_country_df.loc[[high_CSN_merged_country_df["percentage"].idxmax()]])

conn.commit()
cursor.close()
conn.close()

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

      question_id country  count
0              31      AD      2
1              31      AE    429
2              31      AF     10
3              31      AG      4
4              31      AL     56
...           ...     ...    ...
1120           40      XK      2
1121           40      YE      1
1122           40      ZA    695
1123           40      ZM     17
1124           40      ZW     24

[1125 rows x 3 columns]
      question_id country  count  total_respondents  percentage
0              31      AD      2                 36    5.555556
1              31      AE    429               5502    7.797165
2              31      AF     10                108    9.259259
3              31      AG      4                 36   11.111111
4              31      AL     56                738    7.588076
...           ...     ...    ...                ...         ...
1120           40      XK      2                 24    8.333333
1121           40      YE      1                 12    8.333333
112

In [None]:
tracemalloc.start()
start_time = time.time()

conn = connect_to_mysql()
cursor = create_cursor(conn)
connect_to_db(cursor, db_name)

# find responses between 4 and 5 to questions relating to low conscientiousness for each country
least_conscientious_ids = get_question_ids_by_trait_category(cursor, least_conscientious)
least_conscientious_view = "least_conscientious_view"
create_view_of_selected_questions(cursor, least_conscientious_view, least_conscientious_ids)
country_count_df = count_per_country_df_from_view(cursor, least_conscientious_view)

low_CSN_by_country_df = agreement_by_country(cursor, least_conscientious_view)
print(low_CSN_by_country_df)

low_CSN_merged_country_df = merge_country_df_to_find_percentage(low_CSN_by_country_df, country_count_df)
print(low_CSN_merged_country_df)
print(low_CSN_merged_country_df.loc[[low_CSN_merged_country_df["percentage"].idxmax()]])

conn.commit()
cursor.close()
conn.close()

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

     question_id country  count
0             32      AD      3
1             32      AE    312
2             32      AF      6
3             32      AG      1
4             32      AL     51
..           ...     ...    ...
686           38      XK      1
687           38      YE      2
688           38      ZA    171
689           38      ZM      2
690           38      ZW      4

[691 rows x 3 columns]
     question_id country  count  total_respondents  percentage
0             32      AD      3                 24   12.500000
1             32      AE    312               3668    8.505998
2             32      AF      6                 72    8.333333
3             32      AG      1                 24    4.166667
4             32      AL     51                492   10.365854
..           ...     ...    ...                ...         ...
686           38      XK      1                 16    6.250000
687           38      YE      2                  8   25.000000
688           38      ZA 

In [None]:
tracemalloc.start()
start_time = time.time()

conn = connect_to_mysql()
cursor = create_cursor(conn)
connect_to_db(cursor, db_name)

# find responses between 4 and 5 to questions relating to high neuroticism for each country
most_neurotic_ids = get_question_ids_by_trait_category(cursor, most_neurotic)
most_neurotic_view = "most_neurotic_view"
create_view_of_selected_questions(cursor, most_neurotic_view, most_neurotic_ids)
country_count_df = count_per_country_df_from_view(cursor, most_neurotic_view)

high_EST_by_country_df = agreement_by_country(cursor, most_neurotic_view)
print(high_EST_by_country_df)

high_EST_merged_country_df = merge_country_df_to_find_percentage(high_EST_by_country_df, country_count_df)
print(high_EST_merged_country_df)
print(high_EST_merged_country_df.loc[[high_EST_merged_country_df["percentage"].idxmax()]])

conn.commit()
cursor.close()
conn.close()

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

      question_id country  count
0              11      AD      1
1              11      AE    426
2              11      AF      8
3              11      AG      2
4              11      AI      1
...           ...     ...    ...
1465           20      XK      2
1466           20      YE      2
1467           20      ZA    330
1468           20      ZM      7
1469           20      ZW      9

[1470 rows x 3 columns]
      question_id country  count  total_respondents  percentage
0              11      AD      1                 48    2.083333
1              11      AE    426               7336    5.806979
2              11      AF      8                144    5.555556
3              11      AG      2                 48    4.166667
4              11      AI      1                  8   12.500000
...           ...     ...    ...                ...         ...
1465           20      XK      2                 32    6.250000
1466           20      YE      2                 16   12.500000
146

In [None]:
tracemalloc.start()
start_time = time.time()

conn = connect_to_mysql()
cursor = create_cursor(conn)
connect_to_db(cursor, db_name)

# find responses between 4 and 5 to questions relating to low neuroticism (most emotional stability) for each country
least_neurotic_ids = get_question_ids_by_trait_category(cursor, least_neurotic)
least_neurotic_view = "least_neurotic_view"
create_view_of_selected_questions(cursor, least_neurotic_view, least_neurotic_ids)
country_count_df = count_per_country_df_from_view(cursor, least_neurotic_view)

low_EST_by_country_df = agreement_by_country(cursor, least_neurotic_view)
print(low_EST_by_country_df)

low_EST_merged_country_df = merge_country_df_to_find_percentage(low_EST_by_country_df, country_count_df)
print(low_EST_merged_country_df)
print(low_EST_merged_country_df.loc[[low_EST_merged_country_df["percentage"].idxmax()]])

conn.commit()
cursor.close()
conn.close()

# Get memory usage details
current, peak = tracemalloc.get_traced_memory()
end_time = time.time()  # Stop execution timer

# Print memory stats
print(f"Current memory usage: {current / 1024 ** 2:.2f} MB")
print(f"Peak memory usage: {peak / 1024 ** 2:.2f} MB")
print(f"Execution time: {end_time - start_time:.2f} seconds")

tracemalloc.stop()

     question_id country  count
0             12      DE   1903
1             12      US  70730
2             12      EG     99
3             12      PH   2237
4             12      AU   6772
..           ...     ...    ...
359           14      SL      1
360           14      SO      1
361           14      XK      1
362           14      SZ      1
363           14      RW      1

[364 rows x 3 columns]
     question_id country  count  total_respondents  percentage
0             12      DE   1903               8304   22.916667
1             12      US  70730             327644   21.587455
2             12      EG     99                652   15.184049
3             12      PH   2237              11822   18.922348
4             12      AU   6772              30062   22.526778
..           ...     ...    ...                ...         ...
359           14      SL      1                  6   16.666667
360           14      SO      1                  8   12.500000
361           14      XK 

In [None]:
# assign a label (indicating their trait) to each user by calculating the difference to average scores for each trait
# find the most common combinations
  # if someone is extroverted, how likely are they to be agreeable
  # if someone is both extroverted, agreeable are they likely to also be x

# something like:
  # SELECT COUNT(is_agreeable) / COUNT(*) as percent agreeable
  # FROM user
  # WHERE is_extroverted = TRUE