# Python - Creating Own Data
- create data inside python - list, dictionary, distribution etc
- upload data from laptop - csv, excel , etc
- import data from web - git, url etc

In [1]:
import numpy as np
import pandas as pd
import random

## Step-1 : Setup
- Create data using various Python structures

In [18]:
# Rollnos - list
rollno = list(range(1, 21))   # 20 students
rollno

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [19]:
# Names - tuple
sname = (     "Alice", "Bob", "Charlie", "Diana", "Ethan",  "Fiona", "George", "Hannah", "Ian", "Jasmine",
    "Kevin", "Lily", "Mike", "Nina", "Oscar",  "Priya", "Quinn", "Raj", "Sophia", "Tom")
print(list(sname))

['Alice', 'Bob', 'Charlie', 'Diana', 'Ethan', 'Fiona', 'George', 'Hannah', 'Ian', 'Jasmine', 'Kevin', 'Lily', 'Mike', 'Nina', 'Oscar', 'Priya', 'Quinn', 'Raj', 'Sophia', 'Tom']


In [20]:
# Gender - set (unique)
genders_set = {"Male", "Female"}
gender = [random.choice(list(genders_set)) for _ in rollnos]
print(list(gender))

['Male', 'Female', 'Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male', 'Female', 'Female', 'Male']


In [21]:
# Course - dictionary : key value pair
courses_dict = {1: "CS", 2: "IT", 3: "AI", 4: "DS"}  # mapping
course = [courses_dict[random.randint(1, 4)] for _ in rollnos]
print(list(course))

['DS', 'AI', 'DS', 'DS', 'AI', 'DS', 'IT', 'IT', 'AI', 'DS', 'AI', 'DS', 'IT', 'AI', 'AI', 'AI', 'DS', 'IT', 'DS', 'CS']


In [26]:
# Scores - random normal
# Normally distributed around mean=70, std=10, clipped to 40–100
score = np.clip(np.random.normal(70, 10, len(rollnos)), 40, 100).round(1)
score

array([53.2, 69.3, 78.1, 64.7, 65.7, 67.4, 51.7, 70.5, 73.9, 54. , 84.6,
       57.7, 72.2, 67. , 70.8, 66.8, 54.9, 62.7, 64.5, 64.3])

In [27]:
# Age - random, integer
age = np.random.randint(18, 25, size=len(rollnos))
age

array([18, 21, 23, 22, 20, 23, 23, 21, 20, 22, 18, 18, 23, 19, 18, 21, 19,
       21, 19, 23])

## Step - 2 : Data Frame
- Join Columns to Data Frame
- 20 Students Data

In [28]:
df = pd.DataFrame({
    "rollno": rollno,
    "sname": sname,
    "gender": gender,
    "course": course,
    "score": score,
    "age": age
})
df

Unnamed: 0,rollno,sname,gender,course,score,age
0,1,Alice,Male,DS,53.2,18
1,2,Bob,Female,AI,69.3,21
2,3,Charlie,Male,DS,78.1,23
3,4,Diana,Female,DS,64.7,22
4,5,Ethan,Female,AI,65.7,20
5,6,Fiona,Male,DS,67.4,23
6,7,George,Female,IT,51.7,23
7,8,Hannah,Male,IT,70.5,21
8,9,Ian,Female,AI,73.9,20
9,10,Jasmine,Male,DS,54.0,22


In [None]:
## Larget DF 
- install faker for snames : pip install faker

In [30]:
#pip install faker

In [38]:
import pandas as pd
import numpy as np
import random
from faker import Faker

In [40]:
# n - no of students
n=1000
fake = Faker()

In [44]:
print(tuple(fake.name() for _ in rollno[0:5]))

('Brenda Jackson', 'Danielle Barker', 'Lisa Villegas', 'Charles Jones', 'Timothy Graves')


In [45]:
rollno = list(range(1, n))
sname = tuple(fake.name() for _ in rollno)
genders_set = {"Male", "Female"}
gender = [random.choice(list(genders_set)) for _ in rollno]
courses_dict = {1: "CS", 2: "IT", 3: "AI", 4: "DS"}
course = [courses_dict[random.randint(1, 4)] for _ in rollno]
score = np.clip(np.random.normal(70, 10, len(rollno)), 40, 100).round(1)
age = np.random.randint(18, 25, size=len(rollno))

In [46]:
df2 = pd.DataFrame({
    "rollno": rollno,
    "sname": sname,
    "gender": gender,
    "course": course,
    "score": score,
    "age": age
})
df2.shape

(999, 6)

In [47]:
df2.head()

Unnamed: 0,rollno,sname,gender,course,score,age
0,1,Marcus Roberts,Male,CS,71.5,19
1,2,Aaron Gonzalez,Female,IT,81.7,22
2,3,Carl Schmidt,Female,CS,57.0,22
3,4,Stacey Hart DVM,Male,DS,77.3,21
4,5,Harry Simon,Male,CS,57.2,23


## Data from CSV
### file in laptop
- download from : https://catalog.data.gov/dataset/real-estate-sales-2001-2018  (125 MB)
    - listing of all real estate sales with a sales price of $2,000 or greater that occur between October 1 and September 30 of each year. For each sale record, the file includes: town, property address, date of sale, property type (residential, apartment, commercial, industrial or vacant land), sales price, and property assessment. 
- IMF : https://data.imf.org/en/datasets/IMF.STA:QGDP_WCA  (187 KB)
    - This dataset provides a snapshot of the economic activity of the world and selected country aggregates on a quarterly basis. It offers quarterly measurements of Gross Domestic Product (GDP). Quarterly estimates are disseminated on a seasonally adjusted basis.
- Country and capitals; https://ec.europa.eu/eurostat/statistics-explained/images/9/9f/Country_Codes_and_Names.xlsx   (XL)

In [63]:
f1='Real_Estate_Sales_2001-2023_GL.csv'
f2='dataset_2025-08-16T11_40_29.251665198Z_DEFAULT_INTEGRATION_IMF.STA_QGDP_WCA_3.0.0.csv'
f3 ='Country_Codes_and_Names.xlsx'
FOLDER ='/Users/du/dup/analytics/data/'
file1 = FOLDER + f1
file2 = FOLDER + f2
file3 = FOLDER + f3

print(file1,'\n', file2, '\n', file3)

/Users/du/dup/analytics/data/Real_Estate_Sales_2001-2023_GL.csv 
 /Users/du/dup/analytics/data/dataset_2025-08-16T11_40_29.251665198Z_DEFAULT_INTEGRATION_IMF.STA_QGDP_WCA_3.0.0.csv 
 /Users/du/dup/analytics/data/Country_Codes_and_Names.xlsx


In [64]:
df_f1 = pd.read_csv(file1)
df_f1.shape

  df_f1 = pd.read_csv(file1)


(1141722, 14)

In [65]:
df_f1.head(2)

Unnamed: 0,Serial Number,List Year,Date Recorded,Town,Address,Assessed Value,Sale Amount,Sales Ratio,Property Type,Residential Type,Non Use Code,Assessor Remarks,OPM remarks,Location
0,2020177,2020,04/14/2021,Ansonia,323 BEAVER ST,133000.0,248400.0,0.5354,Residential,Single Family,,,,POINT (-73.06822 41.35014)
1,2020225,2020,05/26/2021,Ansonia,152 JACKSON ST,110500.0,239900.0,0.4606,Residential,Three Family,,,,


In [66]:
df_f2 = pd.read_csv(file2)
df_f2.shape

(60, 98)

In [67]:
df_f2.head(2)

Unnamed: 0,DATASET,SERIES_CODE,OBS_MEASURE,COUNTRY,INDICATOR,TYPE_OF_TRANSFORMATION,FREQUENCY,SCALE,PRECISION,DECIMALS_DISPLAYED,...,2022-Q4,2023-Q1,2023-Q2,2023-Q3,2023-Q4,2024-Q1,2024-Q2,2024-Q3,2024-Q4,2025-Q1
0,IMF.STA:QGDP_WCA(3.0.0),G001.B1GQ_S1.POP_PCH_PT.Q,OBS_VALUE,World,"Gross domestic product (GDP), Total economy","Period-over-preceding period percent change, P...",Quarterly,,,,...,0.970324,0.892833,0.464511,0.962598,0.706613,0.777499,0.246007,0.149742,0.550311,1.879711
1,IMF.STA:QGDP_WCA(3.0.0),U142.B1GQ_S1_Q.POP_PCH_PT.Q,OBS_VALUE,Asia,"Gross domestic product (GDP), Total economy, C...","Period-over-preceding period percent change, P...",Quarterly,,,,...,0.678801,1.386797,1.416312,1.064424,0.900656,1.162146,0.926886,1.132581,1.364676,1.141001


In [72]:
df_f3 = pd.read_excel(file3, skiprows=[1])
# pd.read_excel("students.xlsx", header=1)  # if headers are on 2nd row
df_f3.head(5)

Unnamed: 0,AREA,CODE,COUNTRY NAME
0,European Union (EU),BE,Belgium
1,European Union (EU),BG,Bulgaria
2,European Union (EU),CZ,Czech Republic
3,European Union (EU),DK,Denmark
4,European Union (EU),DE,Germany (including former GDR from 1991)


In [76]:
# data from Online Resources, downloadable directly
f4 = 'https://raw.githubusercontent.com/DUanalytics/datasets/refs/heads/master/csv/global_world_indicators_2000.csv'
file4 = f4

In [77]:
df_f4 = pd.read_csv(file4)
df_f4.shape

(207, 27)

In [79]:
df_f4.head(2)

Unnamed: 0,Birth Rate,Business Tax Rate,CO2 Emissions,Ease of Business (clusters),Country/Region,Days to Start Business,Ease of Business,Energy Usage,GDP,Health Exp % GDP,...,Mobile Phone Usage,Population 0-14,Population 15-64,Population 65+,Population Total,Population Urban,Region,Tourism Inbound,Tourism Outbound,Year
0,0.02,,87931.0,Low,Algeria,,,26998.0,54790060000.0,0.035,...,0.003,0.342,0.619,0.039,31719449,0.599,Africa,102000000.0,193000000.0,12/1/2000
1,0.05,,9542.0,Low,Angola,,,7499.0,9129595000.0,0.034,...,0.002,0.476,0.499,0.025,13924930,0.324,Africa,34000000.0,146000000.0,12/1/2000


In [82]:
#pip install xlrd


Note: you may need to restart the kernel to use updated packages.


In [94]:
f5A= 'https://my.surrey.ac.uk/sites/default/files/2024-12/UOS_RIGO_template_011%20Sample%20tracking%20log%2022Aug23.xlsx'

In [95]:
file5A = f5A
df_f5A = pd.read_excel(file5A)
#df_f5B = pd.read_excel(file5B, engine='xlrd')
df_f5A.shape

(9, 22)

In [96]:
df_f5A.head()

Unnamed: 0,PI:,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,Study Title:,,Study ID:,ABC,,,,,,,...,,,,,,,,,,
1,NHS REC or HTA Licence,,Consent forms location:,,,,,,,,...,,,,,,,,,,
2,Ethics Application no:,,Expiry of ethics:,,,,,,,,...,,,,,,,,,,
3,Sample Type,W = Whole Blood P=Plasma S=Serum C=PBMC T=...,Sample Storage,"-80, -20, RT, LN2, Other",,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


## Wiki table
- https://en.wikipedia.org/wiki/List_of_cities_by_GDP

In [104]:
wiki1 = "https://en.wikipedia.org/wiki/List_of_cities_by_GDP"
# read all tables on the page
tables = pd.read_html(wiki1)
print(f"Found {len(tables)} tables")

Found 6 tables


In [108]:
# Identify the correct table
# Often the first is the main list — let's preview the first few tables
for idx, table in enumerate(tables[:3]):
    print(f"\nTable {idx} columns: {table.columns.tolist()}\n")


Table 0 columns: [0, 1]


Table 1 columns: [0, 1]


Table 2 columns: ['City proper/metropolitan area', 'Country/region', 'GDP  (billion US$)', 'Population']



In [110]:
df_wk1 = tables[2]
df_wk1.head()

Unnamed: 0,City proper/metropolitan area,Country/region,GDP (billion US$),Population
0,A Coruña metropolitan area,Spain,32.344 (2021)[5],"1,121,758 (2021)[6]"
1,Aachen,Germany,27.092 (2021)[5],"556,631 (2021)[6]"
2,Aalborg,Denmark,30.46 (2022)[5],"591,740 (2022)[6]"
3,Aarhus,Denmark,53.383 (2022)[5],"913,861 (2022)[6]"
4,"Abbotsford, British Columbia",Canada,7.435 (2021)[7],"205,834 (2021)[8]"


### There are different ways to import table in to Python
