In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
anime_df = pd.read_csv('anime_data.csv')
reviews_df = pd.read_csv('anime_reviews.csv')

In [3]:
# Check the shape of the dataframes
print(f"Anime dataset shape: {anime_df.shape}")
print(f"Reviews dataset shape: {reviews_df.shape}")

# Look at the first few rows
print("\nFirst few rows of anime_df:")
print(anime_df.head())

# Check data types
print("\nData types in anime_df:")
print(anime_df.dtypes)

# Check for missing values
print("\nMissing values in anime_df:")
print(anime_df.isnull().sum())

Anime dataset shape: (125, 23)
Reviews dataset shape: (360, 4)

First few rows of anime_df:
      id                                 title  \
0  61443  Shunkashuutou Daikousha: Haru no Mai   
1  16498                    Shingeki no Kyojin   
2   1535                            Death Note   
3   5114      Fullmetal Alchemist: Brotherhood   
4  30276                         One Punch Man   

                      english_title type  episodes           status  airing  \
0                               NaN   TV       NaN    Not yet aired   False   
1                   Attack on Titan   TV      25.0  Finished Airing   False   
2                        Death Note   TV      37.0  Finished Airing   False   
3  Fullmetal Alchemist: Brotherhood   TV      64.0  Finished Airing   False   
4                     One Punch Man   TV      12.0  Finished Airing   False   

                  aired_from                   aired_to       duration  ...  \
0                        NaN                        N

In [4]:
anime_df['score'] = pd.to_numeric(anime_df['score'], errors='coerce')
anime_df['episdoes'] = pd.to_numeric(anime_df['episodes'], errors='coerce')
anime_df['year'] = pd.to_numeric(anime_df['year'], errors='coerce')

print("Updated data types:")
print(anime_df[['score', 'episodes', 'year']].dtypes)

Updated data types:
score       float64
episodes    float64
year        float64
dtype: object


In [5]:
print("Missing values after conversion:")
print(anime_df[['score', 'episodes', 'year']].isnull().sum())

Missing values after conversion:
score       1
episodes    2
year        8
dtype: int64


In [6]:
anime_df['score'] = anime_df['score'].fillna(anime_df['score'].mean())

In [7]:
print("Missing values after conversion:")
print(anime_df[['score', 'episodes', 'year']].isnull().sum())

Missing values after conversion:
score       0
episodes    2
year        8
dtype: int64


In [8]:
anime_df['episodes'] = anime_df['episodes'].fillna(0)

In [9]:
print("Missing values after conversion:")
print(anime_df[['score', 'episodes', 'year']].isnull().sum())

Missing values after conversion:
score       0
episodes    0
year        8
dtype: int64


In [10]:
print(f"Missing year values: {anime_df['year'].isnull().sum()}")

Missing year values: 8


In [11]:
print("\nMissing values after handling:")
print(anime_df[['score', 'episodes', 'year']].isnull().sum())


Missing values after handling:
score       0
episodes    0
year        8
dtype: int64


In [12]:
print("sample value from genres column:")
print(anime_df['genres'].iloc[0])
print(type(anime_df['genres'].iloc[0]))

try:
    anime_df['genres'] = anime_df['genres'].apply(eval)
    anime_df['studios'] = anime_df['studios'].apply(eval)
    anime_df['themes'] = anime_df['themes'].apply(eval)
    print("\nAfter conversion:")
    print(anime_df['genres'].iloc[0])
    print(type(anime_df['genres'].iloc[0]))

except Exception as e:
    print(f"Error during conversion: {e}")
    print("Let's check a few samples to understand the structure:")
    for i in range(3):
        print(f"Row {i}, genres: {anime_df['genres'].iloc[i]}, type: {type(anime_df['genres'].iloc[i])}")

sample value from genres column:
['Drama', 'Romance']
<class 'str'>

After conversion:
['Drama', 'Romance']
<class 'list'>


In [13]:
print("Studios column type check:")
print(anime_df['studios'].iloc[0])
print(type(anime_df['studios'].iloc[0]))

print("\nThemes column type check:")
print(anime_df['themes'].iloc[0])
print(type(anime_df['themes'].iloc[0]))

Studios column type check:
[]
<class 'list'>

Themes column type check:
[]
<class 'list'>


In [14]:
print("Dataset Statistics:")
print(f"Total anime entries: {len(anime_df)}")
print(f"Year range: {anime_df['year'].min()} to {anime_df['year'].max()}")
print(f"Average score: {anime_df['score'].mean():.2f}")
print(f"Most common type: {anime_df['type'].value_counts().index[0]}")

print("\nAdditional Statistics:")
print("\nDistribution of anime types:")
type_counts = anime_df['type'].value_counts()
for anime_type, count in type_counts.items():
    percentage = (count / len(anime_df)) * 100
    print(f" {anime_type}: {count} ({percentage:.1f}%)")

print("\nScore distribution:")
print(f" Minimum score: {anime_df['score'].min():.2f}")
print(f" Maximum score: {anime_df['score'].max():.2f}")
print(f" Median score: {anime_df['score'].median():.2f}")
print(f" Standard deviation: {anime_df['score'].std():.2f}")

#analyze release years
print("\nRelase year analysis:")
anime_df['decade'] = (anime_df['year'] // 10) * 10
decade_count = anime_df['decade'].value_counts().sort_index()
print("anime count by decade:")
for decade, count in decade_count.items():
    print(f" {int(decade)}s: {count} anime")

Dataset Statistics:
Total anime entries: 125
Year range: 1989.0 to 2022.0
Average score: 8.14
Most common type: TV

Additional Statistics:

Distribution of anime types:
 TV: 118 (94.4%)
 Movie: 6 (4.8%)
 ONA: 1 (0.8%)

Score distribution:
 Minimum score: 6.37
 Maximum score: 9.10
 Median score: 8.14
 Standard deviation: 0.52

Relase year analysis:
anime count by decade:
 1980s: 1 anime
 1990s: 3 anime
 2000s: 19 anime
 2010s: 83 anime
 2020s: 11 anime


In [15]:
# analyze episdoes vs rating
print("\nEpisode count vs ratings:")
anime_df['episode_category'] = pd.cut(
    anime_df['episodes'],
    bins=[0, 1, 12, 24, 50, 100, 1000],
    labels=['Unknown', '1 (Movie/OVA)', '2-12 (Short)', '13-24 (Standard)', '25-50 (Long)', '51+ (Very Long)']
)
episode_scores = anime_df.groupby('episode_category', observed=False)['score'].agg(['mean', 'count']).reset_index()
print(episode_scores)


Episode count vs ratings:
   episode_category      mean  count
0           Unknown  8.736667      6
1     1 (Movie/OVA)  8.032973     37
2      2-12 (Short)  8.082273     44
3  13-24 (Standard)  8.180000     25
4      25-50 (Long)  8.485000      4
5   51+ (Very Long)  8.170000      7


In [16]:
#show top rated anime
print("\nTop 10 highest rated anime:")
top_anime = anime_df.sort_values('score', ascending=False).head(10)
for i, (_, anime) in enumerate(top_anime.iterrows(), 1):
    print(f"{i}. {anime['title']} ({anime['year']}) - Score: {anime['score']:.2f}")


Top 10 highest rated anime:
1. Fullmetal Alchemist: Brotherhood (2009.0) - Score: 9.10
2. Steins;Gate (2011.0) - Score: 9.07
3. Shingeki no Kyojin Season 3 Part 2 (2019.0) - Score: 9.05
4. Hunter x Hunter (2011) (2011.0) - Score: 9.03
5. Koe no Katachi (nan) - Score: 8.93
6. Clannad: After Story (2008.0) - Score: 8.93
7. Code Geass: Hangyaku no Lelouch R2 (2008.0) - Score: 8.91
8. Monster (2004.0) - Score: 8.88
9. Kimi no Na wa. (nan) - Score: 8.83
10. Shingeki no Kyojin: The Final Season (2021.0) - Score: 8.78


In [17]:
all_genres = []
for genres in anime_df['genres']:
    all_genres.extend(genres)

unique_genres = list(set(all_genres))
print(f"Number of unique genres found: {len(unique_genres)}")
print(f"First 5 unique genres: {unique_genres[:5]}")

for genre in unique_genres:
    anime_df[f'genre_{genre}'] = anime_df['genres'].apply(lambda x: 1 if genre in x else 0)

sample_anime = anime_df.iloc[0]
print(f"\nSample anime: {sample_anime['title']}")
print(f"Original genres: {sample_anime['genres']}")
for genre in unique_genres:
    if sample_anime[f"genre_{genre}"] == 1:
        print(f" - genre_{genre}: {sample_anime[f'genre_{genre}']}")

Number of unique genres found: 17
First 5 unique genres: ['Comedy', 'Horror', 'Ecchi', 'Award Winning', 'Adventure']

Sample anime: Shunkashuutou Daikousha: Haru no Mai
Original genres: ['Drama', 'Romance']
 - genre_Drama: 1
 - genre_Romance: 1


In [18]:
#create popularity score
if 'popularity' in anime_df.columns:
    anime_df['popularity_score'] = 1 / anime_df['popularity']
    top_popular = anime_df.sort_values('popularity_score', ascending=False).head(5)
    print('\nTop 5 most popular anime by converted poularity score:')
    for i, (_, anime) in enumerate(top_popular.iterrows(), 1):
        print(f"{i}. {anime['title']} - Orignal Rank: {anime['popularity']}, Popularity score: {anime['popularity_score']:.6f}")


Top 5 most popular anime by converted poularity score:
1. Shunkashuutou Daikousha: Haru no Mai - Orignal Rank: 0, Popularity score: inf
2. Shingeki no Kyojin - Orignal Rank: 1, Popularity score: 1.000000
3. Death Note - Orignal Rank: 2, Popularity score: 0.500000
4. Fullmetal Alchemist: Brotherhood - Orignal Rank: 3, Popularity score: 0.333333
5. One Punch Man - Orignal Rank: 4, Popularity score: 0.250000


In [19]:
if 'popularity_score' in anime_df.columns:
    min_pop = anime_df['popularity_score'].min()
    max_pop = anime_df['popularity_score'].max()
    anime_df['popularity_normalized'] = 100 * (anime_df['popularity_score'] - min_pop) / (max_pop - min_pop)

    print("\nPopularity score normalization:")
    print(f"Original score range: {min_pop:.6f} to {max_pop:.6f}")
    print("Normalized scores for sample anime:")
    sample_anime = anime_df.sort_values('popularity_normalized', ascending=False).head(3)
    for _, anime in sample_anime.iterrows():
        print(f" {anime['title']}: {anime['popularity_normalized']:.2f}/100")


Popularity score normalization:
Original score range: 0.008065 to inf
Normalized scores for sample anime:
 Shingeki no Kyojin: 0.00/100
 Death Note: 0.00/100
 Fullmetal Alchemist: Brotherhood: 0.00/100


In [20]:
if 'popularity' in anime_df.columns:
    zero_count = (anime_df['popularity'] == 0).sum()
    na_count = anime_df['popularity'].isna().sum()
    print(f"zeroes in popularity column: {zero_count}")
    print(f"NaN values in popularity column: {na_count}")
    anime_df['popularity_score'] = anime_df['popularity'].apply(
        lambda x: 1 / x if x > 0 else 0
    )
    inf_count = np.isinf(anime_df['popularity_score']).sum()
    print(f"Infinite values in popularity_score: {inf_count}")

    anime_df['popularity_score'].replace([np.inf, -np.inf], 0, inplace=True)

    min_pop = anime_df['popularity_score'].min()
    max_pop = anime_df['popularity_score'].max()

    anime_df['popularity_normalized'] = 100 * (anime_df['popularity_score'] - min_pop) / (max_pop - min_pop)

    print("\nUpdated popularity score normalization:")
    print(f"score range: {min_pop:.6f} to {max_pop:.6f}")
    print("No infinite values:", not np.any(np.isinf(anime_df['popularity_normalized'])))

zeroes in popularity column: 1
NaN values in popularity column: 0
Infinite values in popularity_score: 0

Updated popularity score normalization:
score range: 0.000000 to 1.000000
No infinite values: True


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  anime_df['popularity_score'].replace([np.inf, -np.inf], 0, inplace=True)


In [21]:
#create decode feature
if 'year' in anime_df.columns:
    anime_df['decade'] = (anime_df['year'] // 10) * 10
    current_year = 2025
    anime_df['is_recent'] = (current_year - anime_df['year'] <= 5).astype(int)

    print("\nYear-based features:")
    print("Anime counts by decade:")
    decade_count = anime_df['decade'].value_counts().sort_index()
    for decade, count in decade_count.items():
        if not pd.isna(decade):
            print(f" {int(decade)}s: {count} anime")

    print(f"\nRecent anime count (last 5 years): {anime_df['is_recent'].sum()}")


Year-based features:
Anime counts by decade:
 1980s: 1 anime
 1990s: 3 anime
 2000s: 19 anime
 2010s: 83 anime
 2020s: 11 anime

Recent anime count (last 5 years): 11


In [22]:
if all(col in anime_df.columns for col in ['score', 'popularity_normalized']):
    pop_10 = anime_df['popularity_normalized'] / 10
    anime_df['composite_score'] = (0.7 * anime_df['score']) + (0.3 * pop_10)
    top_composite = anime_df.sort_values('composite_score', ascending=False).head(10)
    print("\nTop 10 anime by composite score (rating + popularity):")
    for i, (_, anime) in enumerate(top_composite.iterrows(), 1):
        print(f"{i}. {anime['title']} - Score: {anime['score']:.2f}, Pop: {pop_10.iloc[_]:.2f}, cOmposite: {anime['composite_score']:.2f}")


Top 10 anime by composite score (rating + popularity):
1. Shingeki no Kyojin - Score: 8.56, Pop: 10.00, cOmposite: 8.99
2. Death Note - Score: 8.62, Pop: 5.00, cOmposite: 7.53
3. Fullmetal Alchemist: Brotherhood - Score: 9.10, Pop: 3.33, cOmposite: 7.37
4. Hunter x Hunter (2011) - Score: 9.03, Pop: 1.25, cOmposite: 6.70
5. One Punch Man - Score: 8.49, Pop: 2.50, cOmposite: 6.69
6. Steins;Gate - Score: 9.07, Pop: 0.71, cOmposite: 6.56
7. Kimetsu no Yaiba - Score: 8.44, Pop: 2.00, cOmposite: 6.51
8. Shingeki no Kyojin Season 3 Part 2 - Score: 9.05, Pop: 0.48, cOmposite: 6.48
9. Kimi no Na wa. - Score: 8.83, Pop: 0.91, cOmposite: 6.45
10. Koe no Katachi - Score: 8.93, Pop: 0.50, cOmposite: 6.40


In [23]:
#create studio-based features
if 'studios' in anime_df.columns:
    all_studios = []
    for studios_list in anime_df['studios']:
        if isinstance(studios_list, list) and len(studios_list) > 0:
            all_studios.extend(studios_list)

    unique_studios = list(set(all_studios))
    print(f"Number of unique studios found: {len(unique_studios)}")

    #updating features for studios with atleast 3 anime
    studio_counts = {}
    for studio in unique_studios:
        count = sum(1 for studios in anime_df['studios'] if studio in studios)
        studio_counts[studio] = count

    common_studios = [studio for studio, count in studio_counts.items() if count >= 3]
    print(f"Creating one-hot features for {len(common_studios)} studios with >3 anime")

    for studio in common_studios:
        anime_df[f'studio_{studio}'] = anime_df['studios'].apply(lambda x: 1 if studio in x else 0)

    #creating average scores for studios
    studio_scores = {}
    for studio in common_studios:
        studio_anime = anime_df[anime_df[f'studio_{studio}'] == 1]
        avg_score = studio_anime['score'].mean()
        studio_scores[studio] = avg_score

    sorted_studios = sorted(studio_scores.items(), key=lambda x: x[1], reverse=True)
    top_studios = [studio for studio, _ in sorted_studios[:len(sorted_studios)//4]]
    anime_df['prestigious_studio'] = anime_df['studios'].apply(lambda x: 1 if any(studio in top_studios for studio in x) else 0)

    print(f"Identified {len(top_studios)} prestigious studios")
    print("Top 5 prestigious studios:")
    for studio, score in sorted_studios[:5]:
        print(f" {studio}: average score {score:.2f}")

Number of unique studios found: 37
Creating one-hot features for 17 studios with >3 anime
Identified 4 prestigious studios
Top 5 prestigious studios:
 Sunrise: average score 8.79
 Studio Ghibli: average score 8.70
 Wit Studio: average score 8.50
 ufotable: average score 8.50
 Kyoto Animation: average score 8.32


In [27]:
# create theme-based features
if 'themes' in anime_df.columns:
    all_themes = []
    for themes_list in anime_df['themes']:
        if isinstance(themes_list, list):
            all_themes.extend(themes_list)
    unique_themes = list(set(all_themes))
    print(f"\nNumber of unique themes found: {len(unique_themes)}")

    theme_counts = {}
    for theme in unique_themes:
        count = sum(1 for themes in anime_df['themes'] if theme in themes)
        theme_counts[theme] = count

    common_themes = [theme for theme, count in theme_counts.items() if count >= 5]
    print(f"Creating one-hot features for {len(common_themes)} themes with > 5 anime")

    for theme in common_themes:
        anime_df[f'theme_{theme}'] = anime_df['themes'].apply(lambda x: 1 if theme in x else 0)


Number of unique themes found: 32
Creating one-hot features for 13 themes with > 5 anime


In [28]:
#create features from episode count and duration
if 'episodes' in anime_df.columns:
    conditions = [
        (anime_df['episodes'] == 1),
        (anime_df['episodes'] > 1) & (anime_df['episodes'] <= 6),
        (anime_df['episodes'] > 6) & (anime_df['episodes'] <= 13),
        (anime_df['episodes'] > 13) & (anime_df['episodes'] <= 26),
        (anime_df['episodes'] > 26) & (anime_df['episodes'] <= 52),
        (anime_df['episodes'] > 52) & (anime_df['episodes'] <= 100),
        (anime_df['episodes'] > 100)
    ]
    choices = ['Movie/Special', 'Very Short', 'Short', 'Standard', 'Long', 'Very Long', 'Extended']
    anime_df['length_category'] = np.select(conditions, choices, default='Unkown')

    for category in choices:
        anime_df[f'length_{category}'] = (anime_df['length_category'] == category).astype(int)

    anime_df['log_episodes'] = np.log1p(anime_df['episodes'])
    print("\nEpisodes length categories:")
    for category in choices:
        count = (anime_df['length_category'] == category).sum()
        print(f" {category}: {count} anime")


Episodes length categories:
 Movie/Special: 6 anime
 Very Short: 0 anime
 Short: 53 anime
 Standard: 51 anime
 Long: 4 anime
 Very Long: 2 anime
 Extended: 7 anime


In [30]:
if 'genres' in anime_df.columns:
    genre_pairs = [
        ('Action', 'Adventure'),
        ('Romance', 'Comedy'),
        ('Drama', 'Romance'),
        ('Action', 'Fantasy'),
        ('Mystery', 'Supernatural'),
        ('Comedy', 'Slice of Life')
    ]
    print("\nCreating genre combination features:")
    for genre1, genre2 in genre_pairs:
        feature_name = f'genres_{genre1}_{genre2}'
        if f'genre_{genre1}' in anime_df.columns and f'genre_{genre2}' in anime_df.columns:
            anime_df[feature_name] = (anime_df[f'genre_{genre1}'] & anime_df[f'genre_{genre2}']).astype(int)
            count = anime_df[feature_name].sum()
            print(f" {genre1} + {genre2}: {count} anime")

    anime_df['genre_count'] = anime_df['genres'].apply(len)
    print(f"\nGenre count statistics:")
    print(f" Min {anime_df['genre_count'].min()}")
    print(f" Max: {anime_df['genre_count'].max()}")
    print(f" Average: {anime_df['genre_count'].mean():.2f}")


Creating genre combination features:
 Action + Adventure: 22 anime
 Romance + Comedy: 6 anime
 Drama + Romance: 10 anime
 Action + Fantasy: 28 anime
 Mystery + Supernatural: 3 anime
 Comedy + Slice of Life: 0 anime

Genre count statistics:
 Min 1
 Max: 6
 Average: 2.75


In [31]:
# Create more time-based features
if 'year' in anime_df.columns:
    current_year = 2025
    
    anime_df['age'] = current_year - anime_df['year']
    
    conditions = [
        (anime_df['year'] < 1980),
        (anime_df['year'] >= 1980) & (anime_df['year'] < 1990),
        (anime_df['year'] >= 1990) & (anime_df['year'] < 2000),
        (anime_df['year'] >= 2000) & (anime_df['year'] < 2010),
        (anime_df['year'] >= 2010) & (anime_df['year'] < 2020),
        (anime_df['year'] >= 2020)
    ]
    choices = ['Classic', '80s', '90s', '2000s', '2010s', 'Current']
    anime_df['era'] = np.select(conditions, choices, default='Unknown')
    
    for era in choices:
        anime_df[f'era_{era}'] = (anime_df['era'] == era).astype(int)
    
    print("\nAnime by era:")
    for era in choices:
        count = (anime_df['era'] == era).sum()
        print(f"  {era}: {count} anime")


Anime by era:
  Classic: 0 anime
  80s: 1 anime
  90s: 3 anime
  2000s: 19 anime
  2010s: 83 anime
  Current: 11 anime


In [32]:
# Create text-based features
if 'synopsis' in anime_df.columns:
    anime_df['synopsis_length'] = anime_df['synopsis'].apply(lambda x: len(str(x)) if pd.notna(x) else 0)
    
    keywords = ['tragedy', 'comedy', 'epic', 'journey', 'battle', 'school', 'love', 'magic', 'robot', 'war']
    
    for keyword in keywords:
        anime_df[f'synopsis_has_{keyword}'] = anime_df['synopsis'].apply(
            lambda x: 1 if pd.notna(x) and keyword.lower() in str(x).lower() else 0
        )
    
    print("\nSynopsis keyword features created:")
    for keyword in keywords:
        count = anime_df[f'synopsis_has_{keyword}'].sum()
        print(f"  'synopsis_has_{keyword}': {count} anime")


Synopsis keyword features created:
  'synopsis_has_tragedy': 2 anime
  'synopsis_has_comedy': 2 anime
  'synopsis_has_epic': 4 anime
  'synopsis_has_journey': 13 anime
  'synopsis_has_battle': 18 anime
  'synopsis_has_school': 48 anime
  'synopsis_has_love': 14 anime
  'synopsis_has_magic': 10 anime
  'synopsis_has_robot': 4 anime
  'synopsis_has_war': 36 anime


In [33]:
# Check correlation of new features with score
numeric_features = anime_df.select_dtypes(include=['number']).columns
feature_correlations = anime_df[numeric_features].corr()['score'].sort_values(ascending=False)

print("\nTop 10 features most correlated with score:")
for feature, corr in feature_correlations.head(11).items():
    if feature != 'score': 
        print(f"  {feature}: {corr:.3f}")

print("\nBottom 5 features least correlated with score:")
for feature, corr in feature_correlations.tail(5).items():
    print(f"  {feature}: {corr:.3f}")


Top 10 features most correlated with score:
  composite_score: 0.804
  favorites: 0.518
  genre_Award Winning: 0.342
  prestigious_studio: 0.336
  theme_Military: 0.273
  length_Movie/Special: 0.255
  genre_Drama: 0.252
  members: 0.244
  scored_by: 0.235
  length_Very Long: 0.207

Bottom 5 features least correlated with score:
  genre_Horror: -0.370
  rank: -0.879
  length_Very Short: nan
  genres_Comedy_Slice of Life: nan
  era_Classic: nan
