In [1]:
import numpy as np
import pandas as pd

# Set random seed for reproducibility
np.random.seed(42)

# Parameters
num_particles = 30
num_time_steps = 1000
time = np.arange(num_time_steps)
dt = 1.0  # Time step size

# Generate particle IDs
particle_ids = np.arange(1, num_particles + 1)

# Generate nonlinear deterministic trajectories
# Here, I'm just creating some arbitrary nonlinear functions for position and velocity
def nonlinear_position(t):
    return 10 + (np.sin(t) + 0.5 * np.cos(2*t))

def nonlinear_velocity(t):
    return 10 + (np.cos(t) - 0.5 * np.sin(2*t))

# Generate positions and velocities with nonlinear deterministic dynamics
positions = np.array([nonlinear_position(time) for _ in range(num_particles)])
velocities = np.array([nonlinear_velocity(time) for _ in range(num_particles)])
temp = np.ones(num_particles * num_time_steps)

# Create dataframe
data = {
    'Time': np.repeat(time, num_particles),
    'Particle ID': np.tile(particle_ids, num_time_steps),
    'X Position': positions.flatten(),
    'Y Position': positions.flatten() * 0.8,  # Just for illustration, assuming a different trajectory in y-direction
    'Z Position': positions.flatten() * 1.2,  # Just for illustration, assuming a different trajectory in z-direction
    'X Velocity': velocities.flatten(),
    'Y Velocity': velocities.flatten() * 0.7,  # Just for illustration, assuming a different velocity in y-direction
    'Z Velocity': velocities.flatten() * 1.3,  # Just for illustration, assuming a different velocity in z-direction
    'T Velocity': temp
}

df = pd.DataFrame(data)

# Introduce causal connections
# Let's say the X Velocity is proportional to the square of X Position
df['X Velocity'] += df['X Position']**2 * 0.002

# Let's say Z Position is dependent on X Position and Y Velocity
df['Z Position'] += df['X Position'] * df['Y Velocity'] * 0.01

# T velocity is the total sum of all velocities
df['T Velocity'] *= (df['X Velocity'] + df['Y Velocity'] + df['Z Velocity'])

# Add nonlinear, non-Gaussian error term
# Let's say the error term is proportional to the square of the position
error_term = np.random.chisquare(3, size=len(df))**2
df['X Position'] += error_term * 0.01  # Scale the error for illustration
df['Y Position'] += error_term * 0.01
df['Z Position'] += error_term * 0.01
df['X Velocity'] += error_term * 0.005
df['Y Velocity'] += error_term * 0.005
df['Z Velocity'] += error_term * 0.005

# Add a column soley dependent on others
df['D Particles'] = df['Z Position'] * df['T Velocity'] * 1000

# Display the dataframe
print(df.head())


   Time  Particle ID  X Position  Y Position  Z Position  X Velocity  \
0     0            1   10.628110    8.528110   13.536610   11.284555   
1     0            2   10.675308    8.548629   13.552701   10.332747   
2     0            3   10.617190    8.500695   13.471662   10.203589   
3     0            4   10.655921    8.531679   13.460429    9.392693   
4     0            5    9.759891    7.925802   12.162198    9.314593   

   Y Velocity  Z Velocity  T Velocity    D Particles  
0    7.764055   14.364055   33.220500  449692.953985  
1    7.080913   13.132305   30.483099  413128.332926  
2    6.990935   12.968288   30.110741  405641.728118  
3    6.422158   11.911988   27.674766  372514.207762  
4    6.490896   11.801902   26.723226  325013.162048  


In [2]:
data = df[df["Time"] == 0].copy()
sep = pd.DataFrame([[np.nan]*len(data.columns)])
sep.columns = data.columns
data = pd.concat([data, sep])

for t in range(1, df["Time"].max() + 1):
    curr = df[df["Time"] == t]
    data = pd.concat([data, curr, sep])
data.drop(['Time', 'Particle ID'], axis=1, inplace=True)
data.reset_index(inplace=True, drop=True)
data.head()

Unnamed: 0,X Position,Y Position,Z Position,X Velocity,Y Velocity,Z Velocity,T Velocity,D Particles
0,10.62811,8.52811,13.53661,11.284555,7.764055,14.364055,33.2205,449692.953985
1,10.675308,8.548629,13.552701,10.332747,7.080913,13.132305,30.483099,413128.332926
2,10.61719,8.500695,13.471662,10.203589,6.990935,12.968288,30.110741,405641.728118
3,10.655921,8.531679,13.460429,9.392693,6.422158,11.911988,27.674766,372514.207762
4,9.759891,7.925802,12.162198,9.314593,6.490896,11.801902,26.723226,325013.162048


In [3]:
data.to_csv("simulated_updated.csv")

In [4]:
# data.set_index('D Particles', inplace=True)
data.to_csv("simulated_updated_no_index.csv")

In [13]:
(data < 0).sum().sum()

0

In [85]:
df = pd.read_csv("aerosol_cloud_data.csv", index_col=0)
df.head()

Unnamed: 0_level_0,P,SST,LTS,FTH,WS,DIV,CF
Nd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
66.2,0.0783,292.0,25.8,0.00016,10.4,-1.71e-05,0.902
201.0,0.00799,295.0,23.2,0.00024,8.24,-1.12e-06,0.934
24.7,0.0131,293.0,23.5,0.000224,12.0,1.19e-06,0.901
160.0,0.00187,295.0,17.6,0.00789,2.55,-6.61e-07,0.576
12.2,0.00381,291.0,21.2,0.00032,10.3,1.11e-05,0.646


In [86]:
import pandas as pd
import numpy as np

# Assuming df is your DataFrame
# Create a list to store the smaller DataFrames
small_dfs = []
# Initialize a list to store the current subset of rows
subset = []

# Iterate over the rows of the DataFrame
for index, row in df.iterrows():
    # Check if the row contains only NaN values
    if row.isnull().all():
        # If the subset list is not empty, add it to the list
        if subset:
            # Convert subset list to NumPy array
            subset_array = np.array(subset)
            # Get the maximum number of columns
            print(subset_array.shape)
            max_rows = subset_array.shape[1]
            # Pad or truncate rows to match the maximum number of columns
            subset_array = np.pad(subset_array, ((0, 0), (0, max_columns - subset_array.shape[1])), mode='constant')
            # Append the subset array to the list
            small_dfs.append(subset_array)
            # Reset the subset list
            subset = []
    else:
        # Append the current row to the subset list
        subset.append(row.values)

# Add the last subset list to the list if it's not empty
if subset:
    # Convert subset list to NumPy array
    subset_array = np.array(subset)
    # Get the maximum number of columns
    max_columns = max(subset_array.shape[1] for subset_array in subset)
    # Pad or truncate rows to match the maximum number of columns
    subset_array = np.pad(subset_array, ((0, 0), (0, max_columns - subset_array.shape[1])), mode='constant')
    # Append the subset array to the list
    small_dfs.append(subset_array)

# Now small_dfs contains a list of 2D NumPy arrays
# Stack them along the third dimension to create a 3D NumPy array
result = np.stack(small_dfs, axis=0)


(76, 7)
(67, 7)
(92, 7)
(51, 7)
(94, 7)
(49, 7)
(110, 7)
(33, 7)
(112, 7)
(31, 7)
(128, 7)
(15, 7)
(91, 7)
(38, 7)
(13, 7)
(26, 7)
(117, 7)
(2, 7)
(106, 7)
(34, 7)
(4, 7)
(39, 7)
(99, 7)
(20, 7)
(106, 7)
(16, 7)
(22, 7)
(39, 7)
(81, 7)
(38, 7)
(105, 7)
(1, 7)
(38, 7)
(39, 7)
(63, 7)
(56, 7)
(87, 7)
(58, 7)
(39, 7)
(45, 7)
(74, 7)
(69, 7)
(76, 7)
(39, 7)
(27, 7)
(92, 7)
(51, 7)
(94, 7)
(39, 7)
(9, 7)
(110, 7)
(33, 7)
(112, 7)
(31, 7)
(8, 7)
(119, 7)
(15, 7)
(130, 7)
(13, 7)
(26, 7)
(117, 7)
(2, 7)
(141, 7)
(4, 7)
(39, 7)
(99, 7)
(20, 7)
(123, 7)
(22, 7)
(39, 7)
(81, 7)
(38, 7)
(105, 7)
(40, 7)
(39, 7)
(63, 7)
(56, 7)
(87, 7)
(58, 7)
(39, 7)
(45, 7)
(74, 7)
(69, 7)
(76, 7)
(39, 7)
(27, 7)
(92, 7)
(51, 7)
(94, 7)
(22, 7)
(16, 7)
(9, 7)
(110, 7)
(33, 7)
(112, 7)
(22, 7)
(8, 7)
(8, 7)
(119, 7)
(15, 7)
(130, 7)
(13, 7)
(9, 7)
(16, 7)
(57, 7)
(59, 7)
(2, 7)
(141, 7)
(4, 7)
(22, 7)
(16, 7)
(57, 7)
(41, 7)
(20, 7)
(123, 7)
(22, 7)
(22, 7)
(16, 7)
(57, 7)
(23, 7)
(38, 7)
(105, 7)
(40, 7)
(22, 7)

ValueError: all input arrays must have the same shape

In [87]:
max_rows = 0
for i in small_dfs:
    if i.shape[0] > max_rows:
        max_rows = i.shape[0]
max_rows

141

In [88]:
fin = []
for i in small_dfs:
    out = i
    if i.shape[0] < max_rows:
        out = np.vstack([i, np.zeros((max_rows - i.shape[0], i.shape[1]))])
    fin.append(out)
fin

[array([[ 7.83e-02,  2.92e+02,  2.58e+01, ..., -1.71e-05,  9.02e-01,
          0.00e+00],
        [ 7.99e-03,  2.95e+02,  2.32e+01, ..., -1.12e-06,  9.34e-01,
          0.00e+00],
        [ 1.31e-02,  2.93e+02,  2.35e+01, ...,  1.19e-06,  9.01e-01,
          0.00e+00],
        ...,
        [ 0.00e+00,  0.00e+00,  0.00e+00, ...,  0.00e+00,  0.00e+00,
          0.00e+00],
        [ 0.00e+00,  0.00e+00,  0.00e+00, ...,  0.00e+00,  0.00e+00,
          0.00e+00],
        [ 0.00e+00,  0.00e+00,  0.00e+00, ...,  0.00e+00,  0.00e+00,
          0.00e+00]]),
 array([[ 1.05e-02,  2.88e+02,  2.51e+01, ..., -1.24e-05,  9.58e-01,
          0.00e+00],
        [ 2.17e-02,  2.96e+02,  2.26e+01, ...,  3.82e-06,  9.97e-01,
          0.00e+00],
        [ 5.20e-02,  2.96e+02,  2.26e+01, ..., -5.11e-06,  9.99e-01,
          0.00e+00],
        ...,
        [ 0.00e+00,  0.00e+00,  0.00e+00, ...,  0.00e+00,  0.00e+00,
          0.00e+00],
        [ 0.00e+00,  0.00e+00,  0.00e+00, ...,  0.00e+00,  0.00e+00,
   

In [95]:
for i, d in enumerate(fin):
    d = pd.DataFrame(fin[:, i, :])
    d.columns = df.reset_index().columns
    d.set_index('Nd', inplace=True)
    d.to_csv("./cloud/time_series_{}.csv".format(i))

IndexError: index 141 is out of bounds for axis 1 with size 141

In [93]:
fin.shape

(2917, 141, 8)

In [94]:
pd.DataFrame(fin[:, 30, :])

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.01090,295.0,17.6,0.004620,1.57,2.120000e-06,0.582,0.0
1,0.09350,290.0,24.8,0.000130,11.90,-1.110000e-05,1.000,0.0
2,0.00186,295.0,17.2,0.006160,2.15,5.030000e-06,0.528,0.0
3,0.03160,287.0,25.7,0.000688,8.49,-8.000000e-06,0.869,0.0
4,0.05820,293.0,25.9,0.000205,11.10,1.700000e-06,0.666,0.0
...,...,...,...,...,...,...,...,...
2912,0.00000,0.0,0.0,0.000000,0.00,0.000000e+00,0.000,0.0
2913,0.00925,295.0,17.1,0.000663,2.81,-5.500000e-07,0.860,0.0
2914,0.00000,0.0,0.0,0.000000,0.00,0.000000e+00,0.000,0.0
2915,0.00000,0.0,0.0,0.000000,0.00,0.000000e+00,0.000,0.0


In [11]:
len(small_dfs)

2917