In [1]:
import pandas as pd

# Load the dataset
df = pd.read_parquet('/var/data/lvdthieu/java-data-final-v2.parquet')

# Display the first few rows
print(df.head())


        proj_name                                      relative_path  \
0  ainilili_ratel  ratel/landlords-common/src/main/java/org/nico/...   
1  ainilili_ratel  ratel/landlords-common/src/main/java/org/nico/...   
2  ainilili_ratel  ratel/landlords-common/src/main/java/org/nico/...   
3  ainilili_ratel  ratel/landlords-common/src/main/java/org/nico/...   
4  ainilili_ratel  ratel/landlords-common/src/main/java/org/nico/...   

                  class_name         func_name  \
0    EasyRobotDecisionMakers   howToPlayPokers   
1  MediumRobotDecisionMakers      serialPokers   
2        RobotDecisionMakers  getLandlordScore   
3                    ByteKit           indexOf   
4                   ByteLink           toArray   

                                        masked_class  \
0  class EasyRobotDecisionMakers extends Abstract...   
1  class MediumRobotDecisionMakers extends Abstra...   
2  class RobotDecisionMakers {\n\t\n\tprivate sta...   
3  class ByteKit {\n\n\t/**\n\t * Target b

In [2]:
# Basic information about the dataset
print(df.info())

# Statistical summary of numerical columns
print(df.describe())

# Check for missing values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25848 entries, 0 to 25847
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   proj_name              25848 non-null  object
 1   relative_path          25848 non-null  object
 2   class_name             25848 non-null  object
 3   func_name              25848 non-null  object
 4   masked_class           25848 non-null  object
 5   func_body              25848 non-null  object
 6   len_input              25848 non-null  int64 
 7   len_output             25848 non-null  int64 
 8   total                  25848 non-null  int64 
 9   inherit_elements       25848 non-null  object
 10  method_qualified_name  25848 non-null  object
dtypes: int64(3), object(8)
memory usage: 2.2+ MB
None
          len_input    len_output         total
count  25848.000000  25848.000000  25848.000000
mean     473.282149    140.087241    613.369390
std      429.895261    188.12

In [3]:
import missingno as msno

# Visualize missing data
msno.matrix(df)
msno.heatmap(df)

ModuleNotFoundError: No module named 'missingno'

In [None]:
import matplotlib.pyplot as plt

# Plot histograms
df[['len_input', 'len_output', 'total']].hist(figsize=(10, 10), bins=30)
plt.show()

In [None]:
import seaborn as sns

# Plot count plots for categorical features
categorical_cols = ['proj_name', 'relative_path', 'class_name', 'func_name', 'masked_class', 'inherit_elements', 'method_qualified_name']

for col in categorical_cols:
    plt.figure(figsize=(10, 5))
    sns.countplot(y=col, data=df)
    plt.show()

In [None]:
# Correlation matrix
corr = df[['len_input', 'len_output', 'total']].corr()

# Plot a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.show()

In [None]:
sns.pairplot(df[['len_input', 'len_output', 'total']])
plt.show()

In [None]:
# Plot box plots
for col in ['len_input', 'len_output', 'total']:
    plt.figure(figsize=(10, 5))
    sns.boxplot(x=df[col])
    plt.show()

In [None]:
# Value counts
for col in categorical_cols:
    print(df[col].value_counts())
    print('\n')

In [None]:
# Violin plots
for col in categorical_cols:
    plt.figure(figsize=(10, 5))
    sns.violinplot(x=col, y='total', data=df)  # 'total' is used as an example for numerical column
    plt.show()

# Scatter plots for numerical features
sns.scatterplot(x='len_input', y='len_output', data=df)
plt.show()
