In [54]:
import pandas as pd
import os, shutil

In [55]:
# Read the CSV into a DataFrame; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/filtered_data.csv')

In [56]:
# Columns used by the rest of the notebook: "Address" is the file name looked
# up under ../data/code_files, the remaining four are the flag columns that
# later cells compare against True.
targets = [
    "Address",
    "Model Class",
    "Futile Abstract Pipeline",
    "Schizofrenic Class",
    "Data Class",
]

In [57]:
# "Address" holds file names, and a non-empty string is truthy, so including
# it in the row-wise any() makes the mask depend on the address rather than on
# the flags. Build the mask from the flag columns only.
smell_columns = [column for column in targets if column != "Address"]

# True for rows where at least one flag column equals True (the same
# `== True` test the per-smell selections in the next cell apply).
mask = df[smell_columns].eq(True).any(axis=1)

# Keep only the target columns of the matching rows, in a single .loc call
# instead of chained indexing.
filtered_df = df.loc[mask, targets]

In [58]:
def _select_smell(frame, smell):
    """Return the rows of ``frame`` flagged with ``smell``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain an "Address" column and a column named ``smell``.
    smell : str
        Name of the flag column to filter on.

    Returns
    -------
    pandas.DataFrame
        Two columns, "Address" and ``smell``, restricted to rows where the
        ``smell`` column equals True.
    """
    return frame.loc[frame[smell] == True, ["Address", smell]]


# One two-column frame per smell; the second column name is what the later
# cell uses as the dictionary key (cls.columns[1]).
model_class_df = _select_smell(filtered_df, "Model Class")
futile_abstract_pipeline_df = _select_smell(filtered_df, "Futile Abstract Pipeline")
schizofrenic_class_df = _select_smell(filtered_df, "Schizofrenic Class")
data_class_df = _select_smell(filtered_df, "Data Class")

classes = [model_class_df, futile_abstract_pipeline_df, schizofrenic_class_df, data_class_df]

In [59]:
# Preview the first rows of the "Model Class" selection.
model_class_df.head()

Unnamed: 0,Address,Model Class
0,org.apache.hadoop.metrics2.sink.SqlServerSinkT...,True
1,org.apache.ambari.TestMapReduceJobHistoryUpdat...,True
2,org.apache.ambari.log4j.hadoop.mapreduce.jobhi...,True
3,org.apache.ambari.log4j.hadoop.mapreduce.jobhi...,True
4,org.apache.ambari.log4j.common.store.TestDatab...,True


In [60]:
# Directory that contains one source file per "Address" value.
CODE_DIR = "../data/code_files"
# Files with this many characters or fewer are ignored when picking the
# smallest file.
MIN_CHARS = 1000


def find_smallest_file(addresses, code_dir=CODE_DIR, min_chars=MIN_CHARS):
    """Return the address of the shortest file that is longer than ``min_chars``.

    Each address is opened as UTF-8 text under ``code_dir`` and measured in
    characters. Files that are missing or cannot be read are reported with
    ``print`` and skipped. On a tie the first address encountered wins.

    Parameters
    ----------
    addresses : iterable of str
        File names relative to ``code_dir``.
    code_dir : str
        Directory the files are read from.
    min_chars : int
        Exclusive lower bound on the character count of a candidate file.

    Returns
    -------
    str or None
        The selected address, or None when no file qualifies.
    """
    best_address = None
    best_length = float('inf')
    for address in addresses:
        try:
            with open(f"{code_dir}/{address}", 'r', encoding='utf-8') as file:
                char_count = len(file.read())
        except FileNotFoundError:
            print(f"File not found: {address}")
            continue
        except Exception as e:
            # Best effort: report the problem (e.g. a decoding error) and move on.
            print(f"Error reading file {address}: {e}")
            continue

        if min_chars < char_count < best_length:
            best_length = char_count
            best_address = address
    return best_address


small_files = {}

for class_df in classes:
    # Each frame has exactly two columns: "Address" and the smell flag.
    smell_name = class_df.columns[1]
    smallest_file = find_smallest_file(class_df['Address'])
    if smallest_file is None:
        # Make the empty result visible instead of silently storing None.
        print(f"No file longer than {MIN_CHARS} characters found for {smell_name}")
    small_files[smell_name] = smallest_file

In [61]:
# Show the file selected for each smell column.
small_files

{'Model Class': 'org.apache.fineract.portfolio.loanaccount.handler.BulkUpdateLoanOfficerCommandHandler.java',
 'Futile Abstract Pipeline': 'com.esri.core.geometry.OperatorGeodeticLength.java',
 'Schizofrenic Class': 'org.apache.ambari.server.controller.ViewUrlResponseSwagger.java',
 'Data Class': 'org.apache.openjpa.persistence.inheritance.entity.SubclassH.java'}

In [62]:
# Copy each selected file into small_files/<smell name>/, creating the
# directories on demand.
os.makedirs('small_files', exist_ok=True)

for cls_name, file_address in small_files.items():
    # The selection cell stores None when no file qualified for a smell;
    # there is nothing to copy, so do not create an empty directory or build
    # a bogus ".../None" path.
    if file_address is None:
        print(f"No file selected for {cls_name}; skipping")
        continue

    destination_dir = f'small_files/{cls_name}'
    os.makedirs(destination_dir, exist_ok=True)

    # Build both paths before the try block so the except handlers can always
    # refer to file_path.
    file_path = f"../data/code_files/{file_address}"
    dest_path = os.path.join(destination_dir, file_address)
    try:
        shutil.copy(file_path, dest_path)
        print(f"Copied {file_path} to {dest_path}")
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"Error copying file {file_path}: {e}")

Copied ../data/code_files/org.apache.fineract.portfolio.loanaccount.handler.BulkUpdateLoanOfficerCommandHandler.java to small_files/Model Class/org.apache.fineract.portfolio.loanaccount.handler.BulkUpdateLoanOfficerCommandHandler.java
Copied ../data/code_files/com.esri.core.geometry.OperatorGeodeticLength.java to small_files/Futile Abstract Pipeline/com.esri.core.geometry.OperatorGeodeticLength.java
Copied ../data/code_files/org.apache.ambari.server.controller.ViewUrlResponseSwagger.java to small_files/Schizofrenic Class/org.apache.ambari.server.controller.ViewUrlResponseSwagger.java
Copied ../data/code_files/org.apache.openjpa.persistence.inheritance.entity.SubclassH.java to small_files/Data Class/org.apache.openjpa.persistence.inheritance.entity.SubclassH.java
