# Data Analysis and Visualisation Examples
This notebook illustrates various visualisation techniques and libraries and shows how to implement them.

## Import the Libraries

In [None]:
# for 2D, 3D, and animated visualisation
! pip install plotly

In [None]:
# for geo-mapping
! pip install folium

In [None]:
# for geo-operations
!pip install geopy

In [None]:
# for working with data structures
import pandas as pd
import numpy as np

In [None]:
# graph libraries for work with visualisation
import matplotlib.pyplot as plt
import seaborn as sbn
import folium
import plotly.express as px

In [None]:
import geopy
from geopy.geocoders import Nominatim

## Data Ingestion and Wrangling

### Read Data

In [None]:
# read the data about meteorites landing on Earth
df = pd.read_csv("../../Data/Meteorite_Landings_20240212.csv", sep = ',')

In [None]:
df.shape

In [None]:
# get idea of the look
df.sample(5)

In [None]:
df.info()

In [None]:
df.columns

In [None]:
# renaming columns for convenience
df.rename({'mass (g)':'mass', 'reclat':'lat', 'reclong':'long'}, axis=1, inplace=True)

### Clean the Data

In [None]:
# to check null values in data
df.isnull().sum()

There are many missing values in the records: some in mass and year, and especially in the coordinates of the landing location on Earth.

__Replacing missing mass values with the average (mean)__

In [None]:
# mass: fill the missing values with the column mean
mass_mean = df['mass'].mean()
# int() truncates the fractional part of the mean before it is used as the fill value
df['mass'] = df['mass'].fillna(int(mass_mean))

__Deleting rows with unknown year__

In [None]:
# year
df.dropna(subset=['year'], inplace=True)

In [None]:
df.shape

In [None]:
df.isna().sum()

Keep the remaining rows for non-geo-related analysis.

In [None]:
df.head()

### Fix the Wrong Types

In [None]:
# change the year type to int
df['year'] = df['year'].astype(int)

In [None]:
# show floats with three decimals so they are not presented in scientific notation, e.g. e+04
pd.set_option('display.float_format', '{:.3f}'.format)

In [None]:
df.info()

### Check for Wrong Categorical Values

In [None]:
types = df["nametype"].unique()
types, len(types)

In [None]:
classes = df["recclass"].unique()
classes, len(classes)

### Explore the Attributes

#### Categorical Data

In [None]:
df['nametype'].describe()

In [None]:
type(df['nametype'])

`df['nametype']` is a pandas `Series`. A Series is one-dimensional, so its shape contains only the number of rows.

In [None]:
# Series is one-dimensional
df['nametype'].shape

In [None]:
# explore nametype
nametype_group = df.groupby(by = 'nametype').size()
nametype_group

In [None]:
type(nametype_group)

In [None]:
# In Series the values are labeled by index
nametype_group.index

In [None]:
nametype_group.values

In [None]:
df[(df['nametype'] == 'Relict')]

In [None]:
# Bar plot: number of records per nametype
plt.subplots(figsize=(5, 5))
x = nametype_group.index
y = nametype_group.values
plt.bar(x, y, color='skyblue')
plt.xlabel('nametypes')
# axis label spelling corrected ("occurance" -> "occurrences")
plt.ylabel('number of occurrences')
plt.title('Categories')
plt.show()

In [None]:
# recclass
recclass_group = df.groupby(by = 'recclass').size()
recclass_group

In [None]:
recclass_group.sort_values(ascending=False, inplace = True)

In [None]:
recclass_group.index

In [None]:
# Bar plot: number of records per recclass (groups already sorted by size, largest first)
plt.subplots(figsize=(15, 5))
x = recclass_group.index
y = recclass_group.values
plt.bar(x, y, color='skyblue')
plt.xlabel('recclasses')
# axis label spelling corrected ("occurance" -> "occurrences")
plt.ylabel('number of occurrences')
plt.title('Types')
plt.show()

#### Scatter Plots

In [None]:
# with Seaborn
sbn.pairplot(df, x_vars=['year', 'nametype'], y_vars='mass', height=3)

In [None]:
# with Plotly
fig = px.scatter(df, x="year", y="recclass", size="mass", color="recclass",
           hover_name="recclass", log_x=True, size_max=60)
fig.show()

In [None]:
df.sort_values(by=['year'], ascending=True, inplace=True)

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# animated scatter: frames keyed by year, points grouped and coloured by recclass, sized by mass
x, y = df['year'], df["recclass"]
fig = px.scatter(
    df, x=x, y=y,
    animation_frame='year', animation_group='recclass',
    size='mass', color="recclass", range_x=[1600, 2000],
)
fig.show()

In [None]:
fig = px.scatter(df, x="year", y="nametype", size="mass", color="nametype",
           hover_name="nametype", log_x=True, size_max=30)
fig.show()

In [None]:
# plot the observations to discover outliers
df.plot.scatter(x='year', y='mass', c='year', colormap='viridis')

In [None]:
# pie chart: slices named by year, sized by mass
fig = px.pie(df, names='year', values='mass')
fig.show()

In [None]:
fig = px.line_3d(x=df['recclass'], y=df['year'], z=df['mass']) 
fig.show()

In [None]:
# 3D scatter plot
fig = px.scatter_3d(df, x="year", y="mass", z='recclass', color="year", size='mass', size_max=40, opacity=0.8)
fig.show()

In [None]:
# bar chart
fig = px.bar(df, x="recclass", y="mass", color = "recclass") 
fig.show()

## Feature Engineering

### Encode "nametype" with One-Hot Encoding

In [None]:
# assign a type category for use with qualitative data
df["nametype"] = df["nametype"].astype('category')

In [None]:
# apply one-hot encoding to the name types
onehot = pd.get_dummies(df['nametype'], dtype=float)
onehot.sample(5)

In [None]:
# replace the original nametype with one-hot encoded
df = pd.get_dummies(df, columns = ['nametype'], dtype=float)
df.sample(5)

In [None]:
df.columns

In [None]:
# valid only: rows whose one-hot 'nametype_Valid' flag is set
dfv = df.loc[df['nametype_Valid'] == 1]
dfv.sample(3)

In [None]:
# relict only: rows whose one-hot 'nametype_Relict' flag is set
dfr = df.loc[df['nametype_Relict'] == 1]
dfr.sample(3)

### Encode "recclass" with Label Encoding

In [None]:
df["recclass"].unique()

In [None]:
df['recclass'].value_counts()

In [None]:
# assign a type category for use with qualitative data
df["recclass"] = df["recclass"].astype('category')

In [None]:
df["recclass"] = df["recclass"].cat.codes

In [None]:
df.info()

In [None]:
df.sample(5)

## Explore the Data by Statistics

In [None]:
df.describe()

In [None]:
numdf = df.select_dtypes(include='number')
numdf

In [None]:
# drop the 'id' column from the numeric frame
numdf.drop(columns=['id'], inplace=True)

In [None]:
numdf.head()

In [None]:
numdf.columns

### Explore and Remove Outliers

In [None]:
numdf.hist(bins = 20)

In [None]:
numdf.describe()

In [None]:
# plot outliers
plt.scatter(x=numdf['year'], y=numdf['mass'], alpha=0.5)
plt.show()

In [None]:
numdf.boxplot()

In [None]:
# outliers by inter-quartile range (IQR)
from typing import List
def remove_outliers(df: pd.DataFrame, data: pd.Series) -> pd.DataFrame:
    """Drop, in place, the rows of ``df`` whose ``data`` value is an IQR outlier.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or more than 1.5 * IQR above the third quartile of ``data``.

    Parameters
    ----------
    df : the frame to clean; it is modified in place.
    data : the column (a Series sharing ``df``'s index) used to detect outliers.

    Returns
    -------
    The same ``df`` object, after the outlier rows have been removed.
    """
    q1 = data.quantile(.25)
    q3 = data.quantile(.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    # values smaller than 1.5 IQR below q1 or bigger than 1.5 IQR over q3
    outliers = df[(data < lower) | (data > upper)]
    # drop(..., inplace=True) returns None, so its result must not be assigned back to df
    df.drop(outliers.index, inplace=True)
    return df

In [None]:
remove_outliers(numdf, numdf['mass'])
numdf.shape

In [None]:
remove_outliers(numdf, numdf['year'])
numdf.shape

In [None]:
numdf.describe()

In [None]:
# plot all observations to discover outliers
numdf.plot.scatter(x='year', y='mass', c='mass', colormap='viridis')

## Animation Example

In [None]:
# Animation example with another dataset
df1 = px.data.gapminder()

In [None]:
# preview the gapminder frame (df1) that the animation uses; df is the meteorite data
df1.sample(10)

In [None]:
px.scatter(df1, x="gdpPercap", y="lifeExp", animation_frame="year", animation_group="country",
           size="pop", color="continent", hover_name="country",
           log_x=True, size_max=55, range_x=[100,100000], range_y=[25,90])

### Visualise Geodata with Folium

Create a version of the dataset, cleaned from the rows with missing geo-coordinates. This copy will be used for geo-visualisation analysis.

In [None]:
dfl = df.copy()

In [None]:
dfl.sample(5)

In [None]:
dfl.isnull().sum()

In [None]:
# the coordinate columns were renamed from 'reclat'/'reclong' to 'lat'/'long' when the
# data was loaded, so the old labels would raise a KeyError here
dfl = dfl.dropna(subset=['lat', 'long', 'GeoLocation'])

In [None]:
dfl.shape

In [None]:
dfl.isna().sum()

In [None]:
# use the renamed coordinate columns ('lat', 'long'); 'reclat'/'reclong' no longer exist
subset = ['lat', 'long']
df.dropna(subset=subset, inplace=True)

In [None]:
df.shape

In [None]:
df.sample(10)

In [None]:
# build a map centred on a fixed point and put a marker on that same point
map_centre = [55.6819, 12.5627]
fmap = folium.Map(location=map_centre, zoom_start=4)
center = folium.Marker(
    map_centre,
    popup="Cphbusiness-Lyngby",
    tooltip="new building",
).add_to(fmap)
fmap

In [None]:
# add one more layer
folium.TileLayer('openstreetmap').add_to(fmap)

In [None]:
fmap.add_child(folium.LatLngPopup())

In [None]:
# add countries
political_countries_url = ("http://geojson.xyz/naturalearth-3.3.0/ne_50m_admin_0_countries.geojson")
folium.GeoJson(political_countries_url).add_to(fmap)

In [None]:
fmap

In [None]:
df.sample()

In [None]:
# add markers for the meteorites in a group
fmap = folium.FeatureGroup(name="icon collection", show=False).add_to(fmap)

In [None]:
loc

In [None]:

for index, row in df.iloc[:10].iterrows():
    marker = ''
    loc = row['lat'],row['long'] 
    # loc = row['GeoLocation']
    pop = str(int(row['year']))
    marker = folium.Marker(location=loc, popup=pop, 
                           tooltip = "Click me!", 
                           icon = folium.Icon(color="green", icon='flag')).add_to(fmap)

In [None]:
fmap

In [None]:
folium.LayerControl().add_to(fmap)

In [None]:
fmap

In [None]:
fmap.save("../data/meteorit.html")

In [None]:
GeoLocation = [55.6819, 12.5627]

# close-up map around the point, with a single marker placed on it
m = folium.Map(location=GeoLocation, tiles='openstreetmap', zoom_start=15)
folium.Marker(location=GeoLocation, tooltip='Click me!', popup='<i>Data</i>').add_to(m)

In [None]:
m

## Interactive Visualisation

In [None]:
!pip install pygwalker

In [None]:
import pygwalker as pyg

In [None]:
# read the Kanaries API key from the KANARIES_API_KEY environment variable rather than
# pasting it into the notebook; an empty string is passed when the variable is not set
import os
walker = pyg.walk(df1, kanaries_api_key=os.environ.get("KANARIES_API_KEY", ""))

## See More
- https://plotly.com
- https://darigak.medium.com/your-guide-to-folium-markers-b9324fc7d65d
- https://deparkes.co.uk/2016/06/10/folium-map-tiles/

### Try Using Other Visual Libraries

In [None]:
import networkx as nx

In [None]:
df2 = pd.read_csv('../data/penguins.csv')

In [None]:
G = nx.Graph()

In [None]:
G.add_nodes_from(df2['island']) 

In [None]:
G.add_nodes_from(df2['species']) 

In [None]:
# one (island, species) edge per row of the penguin data
edges = list(zip(df2['island'], df2['species']))
G.add_edges_from(edges)

In [None]:
# G.add_edges_from([(1,2), (1,3), (4,5)]) 

In [None]:
list(G)

In [None]:
nx.draw(G)
plt.show()

In [None]:
# Compute node positions with a spring layout, then draw the labelled graph
pos = nx.spring_layout(G)
nx.draw(
    G, pos=pos, with_labels=True,
    node_color='skyblue', node_size=600,
    font_color='black', font_size=8,
)

# Display the graph
plt.show()