In [9]:
import pandas as pd

# Load the ELLIPSE Final Github Test dataset
df_ellipse = pd.read_csv('ELLIPSE_Final_github_test.csv')

# Standardize column names to lowercase
rename_map = {
    'Cohesion': 'cohesion',
    'Syntax': 'syntax',
    'Vocabulary': 'vocabulary',
    'Phraseology': 'phraseology',
    'Grammar': 'grammar',
    'Conventions': 'conventions'
}
df_ellipse = df_ellipse.rename(columns=rename_map)

print(f"Shape of ELLIPSE github dataset: {df_ellipse.shape}")
print("Columns:", df_ellipse.columns.tolist())
print("\nFirst few rows:")
display(df_ellipse.head())

# Analyze missing values
missing_values = df_ellipse.isnull().sum()
missing_percent = (missing_values / len(df_ellipse)) * 100

print("\nMissing Values Analysis:")
if missing_values.sum() == 0:
    print("No missing values found in the dataset.")
else:
    # Create a summary dataframe for missing values
    missing_info = pd.DataFrame({
        'Missing Values': missing_values,
        'Percentage': missing_percent
    })
    # Filter to show only columns with missing values
    missing_info = missing_info[missing_info['Missing Values'] > 0].sort_values('Missing Values', ascending=False)
    display(missing_info)

Shape of ELLIPSE github dataset: (2571, 26)
Columns: ['text_id_kaggle', 'full_text', 'gender', 'grade', 'race_ethnicity', 'num_words', 'num_words2', 'num_words3', 'num_sent', 'num_para', 'num_word_div_para', 'MTLD', 'TTR', 'Type', 'Token', 'task', 'SES', 'prompt', 'Overall', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions', 'set']

First few rows:


Unnamed: 0,text_id_kaggle,full_text,gender,grade,race_ethnicity,num_words,num_words2,num_words3,num_sent,num_para,...,SES,prompt,Overall,cohesion,syntax,vocabulary,phraseology,grammar,conventions,set
0,26650408983.0,People are discussing about the limitation of ...,Female,11,Hispanic/Latino,486,556,552,12,9,...,Not economically disadvantaged,Impact of technology,2.5,2.5,2.0,3.0,2.0,2.5,3.0,test
1,60346336449.0,"As being the youngest child, I have gained wis...",Male,8,Asian/Pacific Islander,405,434,427,15,5,...,Not economically disadvantaged,Learning from the experience of others,3.5,3.5,3.5,3.0,3.5,3.5,3.5,test
2,92767513178.0,I think people could do one of both if they ne...,Male,11,Black/African American,266,266,265,8,5,...,Economically disadvantaged,Self-reliance,2.5,2.0,2.0,3.0,2.0,2.5,3.0,test
3,199000000000.0,The success is depends on your attitude\n\nYes...,Male,12,Asian/Pacific Islander,498,471,466,19,12,...,Not economically disadvantaged,Positive attitudes,3.5,4.0,3.5,4.0,3.0,3.0,3.5,test
4,245000000000.0,To be somebody in a world that would make some...,Male,12,Asian/Pacific Islander,265,253,251,14,5,...,Not economically disadvantaged,Individuality,3.0,3.0,3.0,3.5,3.5,3.0,3.0,test



Missing Values Analysis:


Unnamed: 0,Missing Values,Percentage
SES,1,0.038895


In [None]:
# Verify if 'Overall' is the average of the 6 metric columns
metric_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# Calculate the mean of the metric columns and round to 1 decimal place
calculated_overall = df_ellipse[metric_cols].mean(axis=1).round(1)

# specific check
print(f"Metrics used: {metric_cols}")
print("Applied rounding to 1 decimal place for calculated average.")

# Compare with the existing 'Overall' column
# We use a small tolerance for floating point comparisons
comparison_diff = (df_ellipse['Overall'] - calculated_overall).abs()

print(f"Max difference between 'Overall' and calculated average: {comparison_diff.max()}")

if comparison_diff.max() < 1e-6:
    print("Conclusion: The 'Overall' column IS the rounded average (1 decimal) of the metric columns.")
else:
    print("Conclusion: The 'Overall' column is NOT exactly the rounded average of the metric columns.")
    # Show mismatches if any
    mismatches = df_ellipse[comparison_diff > 1e-6]
    print(f"Number of mismatches: {len(mismatches)}")
    if not mismatches.empty:
        print("Sample mismatches:")
        display(mismatches[metric_cols + ['Overall']].join(calculated_overall.rename('Calculated_Average')).head())

Metrics used: ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
Max difference between 'Overall' and calculated average: 0.8333333333333335
Conclusion: The 'Overall' column is NOT exactly the average of the metric columns.
Number of mismatches: 2238
Sample mismatches:


Unnamed: 0,cohesion,syntax,vocabulary,phraseology,grammar,conventions,Overall,Calculated_Average
1,3.5,3.5,3.0,3.5,3.5,3.5,3.5,3.416667
2,2.0,2.0,3.0,2.0,2.5,3.0,2.5,2.416667
4,3.0,3.0,3.5,3.5,3.0,3.0,3.0,3.166667
5,3.5,4.0,4.0,3.5,4.0,3.0,3.5,3.666667
6,3.5,3.0,3.0,3.0,3.5,3.5,3.5,3.25


In [7]:
import pandas as pd

# Load the ELLIPSE train dataset
# Assuming train.csv is in the same directory as this notebook
df_train = pd.read_csv('train.csv')

print(f"Shape of ELLIPSE train dataset: {df_train.shape}")
print("\nFirst few rows:")
display(df_train.head())

# Analyze missing values
missing_values = df_train.isnull().sum()
missing_percent = (missing_values / len(df_train)) * 100

print("\nMissing Values Analysis:")
if missing_values.sum() == 0:
    print("No missing values found in the dataset.")
else:
    # Create a summary dataframe for missing values
    missing_info = pd.DataFrame({
        'Missing Values': missing_values,
        'Percentage': missing_percent
    })
    # Filter to show only columns with missing values
    missing_info = missing_info[missing_info['Missing Values'] > 0].sort_values('Missing Values', ascending=False)
    display(missing_info)

Shape of ELLIPSE train dataset: (3911, 8)

First few rows:


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5



Missing Values Analysis:
No missing values found in the dataset.
