In [1]:
import pandas as pd
import re
from utils import *

# 어벤져스 1편

## 데이터 불러오기

In [2]:
path = "./Data/Avengers1.txt"

script_by_scene = get_script_by_scene(path=path)

In [3]:
len(script_by_scene)

98

In [4]:
script_by_scene[1].split("\n")

['01',
 '"And there came a day, a day unlike any other, when Earth\'s mightiest heroes and heroines found themselves united against a common threat. On that day, the Avengers were born--to fight the foes no single superhero could withstand! Through the years, their roster has prospered, changing many times, but their glory has never been denied! Heed the call, then--for now, the Avengers Assemble!"',
 '[BURNING BLUE FLAMES. A smoky cube shape emerges - THE TESSERACT. Filling the screen with BLACKNESS.]',
 'CUT TO: EXT. THRONE ROOM, SPACE ¬ NIGHT',
 '[Kneeling behind a THRONE, a clothed, armored figure known as THE OTHER.]',
 'THE OTHER: The tesseract has awakened. It is on a little world. A human world. They would wield its power,... [he holds out the scepter]',
 'Loki: [Coming out from the shadows, taking the staff]',
 'THE OTHER: But our ally knows its workings as they never will. He is ready to lead. And our force, our Chitauri, will follow.',
 '[thousands of Chitauri are awaiting o

## 가중치 데이터프레임 생성

In [5]:
df = make_weight_df(script_by_scene=script_by_scene)
df

Unnamed: 0,Source,Target,Type,weight
0,CUT TO,THE OTHER,Undirected,1
1,THE OTHER,LOKI,Undirected,10
2,THE OTHER,THE OTHER,Undirected,1
3,NICK FURY,AGENT PHIL COULSON,Undirected,5
4,AGENT PHIL COULSON,COULSON,Undirected,1
...,...,...,...,...
167,[TIGHT ON IRON MAN,THOR,Undirected,1
168,OLD MAN (STAN LEE),SENATOR BOYNTON,Undirected,1
169,SENATOR BOYNTON,WAITRESS,Undirected,1
170,3,NICK FURY,Undirected,1


## 노드 (캐릭터) 명 전처리

In [6]:
set(list(df.Source.unique()) + list(df.Target.unique()))

{'1',
 '2',
 '3',
 'AGENT JASPER SITWELL',
 'AGENT JASPER SITWELL (O.S.)',
 'AGENT JASPER SITWELL (V.O.)',
 'AGENT MARIA HILL',
 'AGENT MARIA HILL TURN UP THAT ENGINE! NUMBER 3 ENGINE IS',
 'AGENT PHIL COULSON',
 'ATTENDING WOMAN',
 'BANNER',
 'BANNER (COMING TO)',
 'BARTON',
 'BLACK WIDOW',
 'BLACK WIDOW (V.O.)',
 'BRUCE',
 'CAPTAIN AMERICA',
 'CAPTAIN AMERICA (V.O.)',
 'CLINT',
 'CLINT BARTON',
 'COULSON',
 'COUNCIL MEMBER 1',
 'COUNCIL MEMBER 2',
 'CUT TO',
 'DR. ERIK SELVIG',
 'ELDER GERMAN MAN',
 'ESCORT 0-6 PILOT (V.O.)',
 'ESCORT 606 PILOT',
 'FURY',
 'GALAGA PLAYER',
 'GENERAL LUCHKOV',
 'HAWKEYE',
 'HAWKEYE (V.O.)',
 'HELMSMAN',
 'HILL',
 'HULK',
 'IRON MAN',
 'IRON MAN (V.O.)',
 'JARVIS',
 'LITTLE GIRL',
 'LOKI',
 'LOKI (V.O.)',
 'LUCHKOV',
 'NASA SCIENCETIST',
 'NATASHA',
 'NATASHA ROMANOFF',
 'NICK FURY',
 'NICK FURY (O.S.)',
 'NICK FURY (V.O.)',
 'NICK FURY THE TESSERACT IS WHERE IT BELONGS',
 'OLD MAN (STAN LEE)',
 'PEGGY CARTER',
 'PEPPER',
 'PEPPER POTTS',
 'PILOT',
 'P

In [7]:
# 동일한 인물명 지정
synonyms = {
    'AGENT JASPER SITWELL': 'SITWELL', 
    'AGENT MARIA HILL': 'MARIA HILL',
    'AGENT MARIA HILL TURN UP THAT ENGINE! NUMBER  ENGINE IS': "MARIA HILL",
    'AGENT PHIL COULSON': 'COULSON',
    'BARTON': 'HAWKEYE',
    'CLINT': 'HAWKEYE',
    'CLINT BARTON': 'HAWKEYE',
    'BANNER': 'BRUCE BANNER',
    'BRUCE': 'BRUCE BANNER',
    'NATASHA': 'BLACK WIDOW',
    'NATASHA ROMANOFF': 'BLACK WIDOW',
    'STEVE': 'CAPTAIN AMERICA',
    'Dr. ERIK SELVIG': 'SELVIG',
    'DR. ERIK SELVIG': "SELVIG",
    'NICK FURY THE TESSERACT IS WHERE IT BELONGS': "NICK FURY",
    'FURY': 'NICK FURY',
    'TONY': 'IRON MAN',
    'TONY OF COURSE THEY ARE, I WAS DIRECTLY INVOLVED. WHICH BRINGS ME TO MY NEXT QUESTION': "IRON MAN",
    '[TIGHT ON IRON MAN': "IRON MAN",
    'PEPPER POTTS': 'PEPPER',
    'Loki': 'LOKI'
}

In [8]:
# 캐릭터명 아닌 것 정리
df.Source = df.Source.apply(lambda x: re.sub(r"\([\w\W]*\)", "", x))
df.Source = df.Source.apply(lambda x: re.sub(r'CUT TO', "", x))
df.Source = df.Source.apply(lambda x: re.sub(r'1', "", x))
df.Source = df.Source.apply(lambda x: re.sub(r'2', "", x))
df.Source = df.Source.apply(lambda x: re.sub(r'3', "", x))

df.Target = df.Target.apply(lambda x: re.sub(r"\([\w\W]*\)", "", x))
df.Target = df.Target.apply(lambda x: re.sub(r'CUT TO', "", x))
df.Target = df.Target.apply(lambda x: re.sub(r'1', "", x))
df.Target = df.Target.apply(lambda x: re.sub(r'2', "", x))
df.Target = df.Target.apply(lambda x: re.sub(r'3', "", x))


# 캐릭터 명 통일
df.Source = df.Source.apply(lambda x: x.strip())
df.Target = df.Target.apply(lambda x: x.strip())

for index, row in df.iterrows():
    if row.Source in synonyms:
        row.Source = synonyms[row.Source]
        
    if row.Target in synonyms:
        row.Target = synonyms[row.Target]

In [9]:
set(list(df.Source.unique()) + list(df.Target.unique()))

{'',
 'ATTENDING WOMAN',
 'BLACK WIDOW',
 'BRUCE BANNER',
 'CAPTAIN AMERICA',
 'COULSON',
 'COUNCIL MEMBER',
 'ELDER GERMAN MAN',
 'ESCORT 0-6 PILOT',
 'ESCORT 606 PILOT',
 'GALAGA PLAYER',
 'GENERAL LUCHKOV',
 'HAWKEYE',
 'HELMSMAN',
 'HILL',
 'HULK',
 'IRON MAN',
 'JARVIS',
 'LITTLE GIRL',
 'LOKI',
 'LUCHKOV',
 'MARIA HILL',
 'NASA SCIENCETIST',
 'NICK FURY',
 'OLD MAN',
 'PEGGY CARTER',
 'PEPPER',
 'PILOT',
 'POLICE SERGEANT',
 'SECURITY GUARD',
 'SELVIG',
 'SENATOR BOYNTON',
 'SHIELD AGENT',
 'SHIELD BASE VOICE',
 'SHIELD SCIENCETIST',
 'SITWELL',
 'THE OTHER',
 'THOR',
 'WAITRESS',
 'WEASELY THUG',
 'YOUNG COP',
 'YOUNG SHIELD PILOT'}

## 추가 전처리

In [10]:
avengers_1 = preprocess(df=df, series_no=1)

print(avengers_1.shape)
avengers_1.head()

(56, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Source,Target,Type,weight,series
0,CAPTAIN AMERICA,BLACK WIDOW,Undirected,29,1
1,THOR,LOKI,Undirected,28,1
2,NICK FURY,CAPTAIN AMERICA,Undirected,22,1
3,NICK FURY,LOKI,Undirected,15,1
4,CAPTAIN AMERICA,THOR,Undirected,15,1


In [11]:
avengers_1[avengers_1.Source == "CAPTAIN AMERICA"]

Unnamed: 0,Source,Target,Type,weight,series
0,CAPTAIN AMERICA,BLACK WIDOW,Undirected,29,1
4,CAPTAIN AMERICA,THOR,Undirected,15,1
11,CAPTAIN AMERICA,COULSON,Undirected,10,1
15,CAPTAIN AMERICA,HAWKEYE,Undirected,8,1
34,CAPTAIN AMERICA,POLICE SERGEANT,Undirected,1,1
49,CAPTAIN AMERICA,PEGGY CARTER,Undirected,1,1


In [12]:
avengers_1[avengers_1.Target == "CAPTAIN AMERICA"]

Unnamed: 0,Source,Target,Type,weight,series
2,NICK FURY,CAPTAIN AMERICA,Undirected,22,1
35,YOUNG COP,CAPTAIN AMERICA,Undirected,1,1
44,PILOT,CAPTAIN AMERICA,Undirected,1,1


# 어벤져스 2편 (에이지 오브 울트론)

## 데이터 불러오기

In [13]:
path = "./Data/Avengers2.txt"

script_by_scene = get_script_by_scene(path=path)

In [14]:
len(script_by_scene)

41

In [15]:
script_by_scene[1].split("\n")

[' 01',
 'iron man: Shit!',
 "captain america : Language! JARVIS, what's the view from upstairs?",
 "JARVIS: The central building is protected by some kind of energy shield. Strucker's technology is well beyond any other Hydra base we've taken.",
 "Thor: Loki's scepter must be here. Strucker couldn't mount this defense without it. At long last.",
 '[Natasha knocks out some soldiers]',
 'black widow: At long last is lasting a little long, boys.',
 '[As some soldiers shoot at him]',
 'hawkeye: Yeah. I think we lost the element of surprise.',
 'iron man: Wait a second. No one else is going to deal with the fact that Cap just said "language?"',
 'captain america: I know.',
 '[Steve throws his bike at some soldiers driving up in their truck]',
 'cap: It just slipped out.',
 '']

## 가중치 데이터프레임 생성

In [16]:
df = make_weight_df(script_by_scene=script_by_scene)
df

Unnamed: 0,Source,Target,Type,weight
0,IRON MAN,CAPTAIN AMERICA,Undirected,1
1,CAPTAIN AMERICA,JARVIS,Undirected,1
2,JARVIS,THOR,Undirected,1
3,THOR,BLACK WIDOW,Undirected,1
4,BLACK WIDOW,HAWKEYE,Undirected,1
...,...,...,...,...
165,WANDA MAXIMOFF,WANDA MAXIMOFF,Undirected,1
166,WANDA MAXIMOFF,FRIDAY,Undirected,1
167,NATASHA ROMANOFF,ZRINKA,Undirected,1
168,ZRINKA,TONY STARK,Undirected,1


## 노드 (캐릭터) 명 전처리

In [17]:
# 동일한 인물명 지정
synonyms = {
    'BRUCE': "BRUCE BANNER",
    'CAP': 'CAPTAIN AMERICA',
    'CLINT BARTON': 'HAWKEYE',
    'FRIDAY RIGHT NOW THE IMPACT WOULD KILL THOUSANDS. ONCE IT GETS HIGH ENOUGH': "FRIDAY",
    "FRIDAY THERE'S THE REST OF THE VIBRANIUM. FUNCTION": "FRIDAY",
    'STEVE': 'CAPTAIN AMERICA',
    'STEVE ROGERS': 'CAPTAIN AMERICA',
    'STEVE ROGERS INCOMING ALREADY CAME IN. STARK, YOU WORRY ABOUT BRINGING THE CITY BACK DOWN SAFELY. THE REST OF US HAVE ONE JOB': 'CAPTAIN AMERICA',
    "THOR'": "THOR",
    'TONY': "IRON MAN",
    'TONY STARK': "IRON MAN",
    'TONY STARK NEWS OR FOOTAGE, KEYWORD': "IRON MAN",
    "ULTRON THAT WAS DRAMATIC! I'M SORRY, I KNOW YOU MEAN WELL. YOU JUST DIDN'T THINK IT THROUGH. YOU WANT TO PROTECT THE WORLD, BUT YOU DON'T WANT IT TO CHANGE. HOW IS HUMANITY SAVED IF IT'S NOT ALLOWED TO...EVOLVE? [PICKS UP ONE OF THE DISMEMBERED IRON LEGIONS] WITH THESE? THESE PUPPETS? THERE'S ONLY ONE PATH TO PEACE": "ULTRON",
    'NATASHA ROMANOFF': 'BLACK WIDOW',
    'NAT': 'BLACK WIDOW',
    "SAM WILSON": "FALCON",
    'ERIK SELVIG': "SELVIG"
}

In [18]:
# 캐릭터 명 통일
df.Source = df.Source.apply(lambda x: x.strip())
df.Target = df.Target.apply(lambda x: x.strip())

for index, row in df.iterrows():
    if row.Source in synonyms:
        row.Source = synonyms[row.Source]
        
    if row.Target in synonyms:
        row.Target = synonyms[row.Target]

In [19]:
set(list(df.Source.unique()) + list(df.Target.unique()))

{'BALLET INSTRUCTOR',
 "BARTON'S DAUGHTER",
 'BLACK WIDOW',
 'BRUCE BANNER',
 'CAPTAIN AMERICA',
 'DR. HELEN CHO',
 'DR. LIST',
 'FALCON',
 'FORTRESS SOLDIER',
 'FRIDAY',
 'HAWKEYE',
 'HEIMDALL',
 'IRON LEGION',
 'IRON MAN',
 'JAMES RHODES',
 'JARVIS',
 "KLAUE'S MERCENARY",
 'LAURA BARTON',
 'LILA BARTON',
 'MADAME B',
 'MARIA HILL',
 'NICK FURY',
 'PARTY GUEST',
 'PEGGY CARTER',
 'PIETRO MAXIMOFF',
 'SELVIG',
 'SOLDIERS',
 'SPECIALIST CAMERON KLEIN',
 'STAN LEE',
 'STRUCKER',
 'THOR',
 'ULTRON',
 'ULYSSES KLAUE',
 'VISION',
 'WANDA MAXIMOFF',
 'WORLD HUB TECH',
 'ZRINKA'}

## 추가 전처리

In [20]:
avengers_2 = preprocess(df=df, series_no=2)

print(avengers_2.shape)
avengers_2.head()

(114, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Source,Target,Type,weight,series
0,IRON MAN,BRUCE BANNER,Undirected,56,2
1,BLACK WIDOW,CAPTAIN AMERICA,Undirected,27,2
2,HAWKEYE,LAURA BARTON,Undirected,23,2
3,IRON MAN,NICK FURY,Undirected,22,2
4,THOR,CAPTAIN AMERICA,Undirected,21,2


In [21]:
avengers_2[avengers_2.Source == "WANDA MAXIMOFF"]

Unnamed: 0,Source,Target,Type,weight,series
5,WANDA MAXIMOFF,ULTRON,Undirected,20,2
34,WANDA MAXIMOFF,HAWKEYE,Undirected,6,2
53,WANDA MAXIMOFF,BRUCE BANNER,Undirected,3,2
80,WANDA MAXIMOFF,FRIDAY,Undirected,1,2


In [22]:
avengers_2[avengers_2.Target == "WANDA MAXIMOFF"]

Unnamed: 0,Source,Target,Type,weight,series
9,PIETRO MAXIMOFF,WANDA MAXIMOFF,Undirected,17,2
14,CAPTAIN AMERICA,WANDA MAXIMOFF,Undirected,13,2
44,VISION,WANDA MAXIMOFF,Undirected,3,2
49,IRON MAN,WANDA MAXIMOFF,Undirected,3,2
55,DR. HELEN CHO,WANDA MAXIMOFF,Undirected,3,2
72,ULYSSES KLAUE,WANDA MAXIMOFF,Undirected,2,2


# 어벤져스 3편  (인피니티 워)

## 데이터 불러오기

In [23]:
path = "./Data/Avengers3.txt"

script_by_scene = get_script_by_scene(path=path)

In [24]:
len(script_by_scene)

59

In [25]:
script_by_scene[2].split("\n")

['02',
 '[Doctor Strange, Master of the Mystic Arts, proceeds down the main steps of the Sanctum with Wong.]',
 "Stephen Strange:\xa0[Dressed in casual American clothes.]\xa0Seriously? You don't have any money?",
 'Wong:\xa0[Dressed as Wong is always dressed.]\xa0Attachment to the material is detachment from\xa0the spiritual.',
 "Stephen Strange:\xa0I'll tell the guys at the deli.\xa0[Wryly]\xa0Maybe they'll make you a metaphysical ham on\xa0rye.",
 'Wong:\xa0Oh, wait, wait, wait, I think I have\xa0200.',
 'Stephen Strange:\xa0Dollars?',
 'Wong:\xa0Rupees.',
 'Stephen Strange:\xa0Which is?',
 'Wong:\xa0Uh,\xa0buck and a half.',
 'Stephen Strange:\xa0What do you want?',
 "Wong:\xa0I wouldn't say no to a tuna melt.",
 "[Bruce crash-lands through the Sanctum stairs. The Cloak of Levitation swirls around Strange's shoulders immediately.]",
 "Bruce Banner:\xa0Thanos is coming. He's coming...",
 'Stephen Strange:\xa0[Sharing a look with Wong, and now fully in his mage attire]\xa0Who?',
 '[Ti

## 가중치 데이터프레임 생성

In [26]:
df = make_weight_df(script_by_scene)
df

Unnamed: 0,Source,Target,Type,weight
0,[THE MARVEL STUDIOS LOGO PLAYS AS USUAL BUT TH...,EBONY MAW,Undirected,1
1,EBONY MAW,THANOS [LOOKING OUT THE LARGE WINDOW WE SAW AT...,Undirected,1
2,THANOS [LOOKING OUT THE LARGE WINDOW WE SAW AT...,THOR,Undirected,1
3,THOR,THANOS,Undirected,8
4,THANOS,LOKI,Undirected,7
...,...,...,...,...
198,PETER PARKER,NEBULA,Undirected,1
199,NEBULA,JAMES RHODES,Undirected,1
200,NICK FURY,MARIA HILL,Undirected,10
201,NICK FURY,MARIA HILL,Undirected,2


## 노드 (캐릭터) 명 전처리

In [27]:
# 동일한 인물명 지정
synonyms = {
    'F.R.I.D.A.Y.': "FRIDAY",
    'GAMORA\xa0THE ENTIRE TIME I KNEW THANOS, HE ONLY EVER HAD ONE GOAL': "GAMORA",
    'JAMES\xa0RHODES': 'JAMES RHODES',
    "T'CHALLA": "BLACK PANTHER",
    "KING\xa0T'CHALLA": "BLACK PANTHER",
    'MARIA\xa0HILL': 'MARIA HILL',
    'MEMORY GAMORA': "GAMORA",
    'MEMORY NEBULA': "NEBULA",
    'NATASHA ROMANOFF': 'BLACK WIDOW',
    'PEPPER POTTS': "PEPPER",
    'PETER PARKER': "SPIDERMAN",
    'PETER QUILL': "STAR-LORD",
    'PETER\xa0QUILL': "STAR-LORD",
    'SAM WILSON': "FALCON",
    'STEPHEN STRANGE': "DOCTOR STRANGE",
    'STEPHEN STRANGE\xa0WAIT, WHAT. THANOS?': "DOCTOR STRANGE",
    'STEPHEN\xa0STRANGE': "DOCTOR STRANGE",
    'STEVE ROGERS': "CAPTAIN AMERICA",
    'STONEKEEPER\xa0WE ALL THINK THAT AT FIRST.': 'STONEKEEPER',
    "THOR\xa0EITRI, THIS ISN'T ABOUT YOUR HANDS. EVERY WEAPON YOU'VE EVER DESIGNED": "THOR",
    'TONY STARK': "IRON MAN",
    'TONY STARK\xa0WHATEVER. POINT IS': "IRON MAN",
    "TONY STARK\xa0YEAH, I'M FINE. I JUST THINK WE MIGHT HAVE TO PUSH OUR 8": "IRON MAN",
    'YOUNG GAMORA': "GAMORA",
    'SCHOOL BUS DRIVER, STAN LEE': "STAN LEE"
}

In [28]:
# 캐릭터명 아닌 것 정리
df.Source = df.Source.apply(lambda x: re.sub(r"\[[\w\W]+", "", x))
df.Target = df.Target.apply(lambda x: re.sub(r"\[[\w\W]+", "", x))

# 캐릭터 명 통일
df.Source = df.Source.apply(lambda x: x.strip())
df.Target = df.Target.apply(lambda x: x.strip())

for index, row in df.iterrows():
    if row.Source in synonyms:
        row.Source = synonyms[row.Source]
        
    if row.Target in synonyms:
        row.Target = synonyms[row.Target]

In [29]:
set(list(df.Source.unique()) + list(df.Target.unique()))

{'',
 'BLACK PANTHER',
 'BLACK WIDOW',
 'BRUCE BANNER',
 'BUCKY BARNES',
 'CAPTAIN AMERICA',
 'CHILD OF THANOS',
 'COLLECTOR',
 'CORVUS GLAIVE',
 'CULL OBSIDIAN',
 'DOCTOR STRANGE',
 'DOME CONTROL',
 'DRAX',
 'EBONY MAW',
 'EITRI',
 'FALCON',
 'FRIDAY',
 'GAMORA',
 "GAMORA'S MOTHER",
 'GROOT',
 'HEIMDALL',
 'HULK',
 'IRON MAN',
 'JABARI WARRIORS',
 'JAMES RHODES',
 'LOKI',
 "M'BAKU",
 'MANTIS',
 'MARIA HILL',
 'NEBULA',
 'NED LEEDS',
 'NICK FURY',
 'OKOYE',
 'PEPPER',
 'PROXIMA MIDNIGHT',
 'RED SKULL',
 'ROCKET',
 'SECRETARY ROSS',
 'SHURI',
 'SPIDERMAN',
 'STAN LEE',
 'STAR-LORD',
 'STONEKEEPER',
 'THANOS',
 'THOR',
 'VISION',
 'WANDA MAXIMOFF',
 'WONG'}

## 추가 전처리

In [30]:
avengers_3 = preprocess(df=df, series_no=3)

print(avengers_3.shape)
avengers_3.head()

(134, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Source,Target,Type,weight,series
0,ROCKET,THOR,Undirected,52,3
1,IRON MAN,DOCTOR STRANGE,Undirected,51,3
2,SPIDERMAN,IRON MAN,Undirected,44,3
3,THANOS,GAMORA,Undirected,40,3
4,STAR-LORD,GAMORA,Undirected,28,3


In [31]:
avengers_3[avengers_3.Source == "CAPTAIN AMERICA"]

Unnamed: 0,Source,Target,Type,weight,series
40,CAPTAIN AMERICA,BLACK WIDOW,Undirected,4,3
46,CAPTAIN AMERICA,SECRETARY ROSS,Undirected,3,3
65,CAPTAIN AMERICA,THOR,Undirected,2,3
81,CAPTAIN AMERICA,OKOYE,Undirected,1,3
132,CAPTAIN AMERICA,WANDA MAXIMOFF,Undirected,1,3


In [32]:
avengers_3[avengers_3.Target == "CAPTAIN AMERICA"]

Unnamed: 0,Source,Target,Type,weight,series
26,BUCKY BARNES,CAPTAIN AMERICA,Undirected,5,3
28,VISION,CAPTAIN AMERICA,Undirected,5,3
29,BRUCE BANNER,CAPTAIN AMERICA,Undirected,5,3
44,FALCON,CAPTAIN AMERICA,Undirected,3,3
67,BLACK PANTHER,CAPTAIN AMERICA,Undirected,2,3
68,SHURI,CAPTAIN AMERICA,Undirected,2,3
80,PROXIMA MIDNIGHT,CAPTAIN AMERICA,Undirected,1,3
95,CORVUS GLAIVE,CAPTAIN AMERICA,Undirected,1,3
100,GROOT,CAPTAIN AMERICA,Undirected,1,3


# 어벤져스 4편 (엔드게임)

## 데이터 불러오기

In [33]:
path = "./Data/Avengers4.txt"

script_by_scene = get_script_by_scene(path=path)

In [34]:
len(script_by_scene)

84

In [35]:
script_by_scene[2].split("\n")

['01',
 '[SOMEWHERE IN\xa0SPACE]',
 '[Marvel Studios Opening sequence begins with Dear Mr. Fantasy playing. Only the heroes who survived the snap are shown.]',
 '[Scene switches to Nebula and Tony on the ship playing paper football]',
 'NEBULA:\xa0Wrra!\xa0[Nebula, frustrated, puts her hands in a fighting stance while looking at Tony.]',
 "TONY STARK:\xa0You don't need to do that. Because uh... you're just holding position.\xa0[Nebula flicks a paper football towards Tony]\xa0Oh yeah, that was close.\xa0[Nebula once again flicks a paper football towards Tony]\xa0That's a goal. We're now one apiece.",
 'NEBULA:\xa0I would like to try again.\xa0[Nebula flicks a paper football towards Tony]',
 "TONY STARK:\xa0We're tied up. Feel the tension? It's fun.",
 '[Tony poorly flicks a paper football towards Nebula]',
 'TONY STARK:\xa0That was terrible. Now you have a chance to win.\xa0[Nebula flicks the paper football towards Tony]',
 "TONY STARK:\xa0And... you've won. Congratulations.\xa0[Tony re

## 가중치 데이터프레임 생성

In [36]:
df = make_weight_df(script_by_scene)
df

Unnamed: 0,Source,Target,Type,weight
0,CLINT BARTON,LILA BARTON,Undirected,9
1,CLINT BARTON,CLINT BARTON,Undirected,7
2,CLINT BARTON,COOPER BARTON,Undirected,1
3,COOPER BARTON,LAURA BARTON,Undirected,3
4,LAURA BARTON,LILA BARTON,Undirected,1
...,...,...,...,...
255,BUCKY BARNES,SAM WILSON,Undirected,1
256,SAM WILSON,(OLD) STEVE ROGERS,Undirected,15
257,(OLD) STEVE ROGERS,(OLD) STEVE ROGERS,Undirected,1
258,(OLD) STEVE ROGERS,[CLOSING TITLE,Undirected,1


## 노드 (캐릭터) 명 전처리

In [37]:
set(list(df.Source.unique()) + list(df.Target.unique()))

{'(OLD) STEVE ROGERS',
 '1\xa0あいつだ! アキヒコさんを！(ROMANIZED AITSU DA! AKIHIKO-SAN WO!) (TRANSLATED',
 'AKIHIKO\xa0[IN A GATOTSU-LIKE SWORD STANCE]\xa0死ね!\xa0(ROMANIZED SHINE!) (ENGLISH',
 'AKIHIKO\xa0[JAPANESE]俺らがだと? 気でも狂ったか! (ORERA GA DATO? KI DEMO KURUTTA KA!)\xa0[HE GESTURES TO ALL THE SCATTERED BODIES OF HIS HENCHMEN AND CHUCKLES]\xa0(ENGLISH',
 'AKIHIKO\xa0てめえ なぜこんなことをする? 俺たちてめえになにもしてねぇだろ！(ROMANIZED TEMĒ NAZE KONNA KOTO WO SURU? ORETACHI TEMĒ NI NANI MO SHITENĒ DARO!) (ENGLISH',
 'AKIHIKO\xa0待て! 助けてくれ! お前に何でもやる! 何が欲しい?\xa0(ROMANIZED MATE! TASUKETE KURE! OMAE NI NAN DEMO YARU! NANI GA HOSĪ? (ENGLISH',
 'ALEXANDER PIERCE (2012)',
 'ASGARDIAN SOLDIER (2013)',
 'BOY',
 'BRUCE',
 'BRUCE (MEMORY)',
 'BRUCE BANNER',
 'BRUCE BANNER (2023)',
 "BRUCE BANNER\xa0I DON'T KNOW WHY EVERYONE BELIEVES THAT, BUT THAT ISN'T TRUE. THINK ABOUT IT",
 'BRUCE ROGERS',
 'BRUCE\xa0BANNER',
 'BUCKY BARNES',
 'CAPTAIN STEVENS (STEVE ROGERS)',
 'CAROL DANVERS',
 'CASSIE LANG',
 'CLINT',
 'CLINT BARTON',
 'CLINT BA

In [38]:
# 동일한 인물명 지정
synonyms = {
    'OUR WATCHER INFORMANT,\xa0STAN LEE (1922-2018 R.I.P)': "",
    
    '1\xa0あいつだ! アキヒコさんを！(ROMANIZED AITSU DA! AKIHIKO-SAN WO!) (TRANSLATED': "AKIHIKO",
    'AKIHIKO\xa0てめえ なぜこんなことをする? 俺たちてめえになにもしてねぇだろ！(ROMANIZED TEMĒ NAZE KONNA KOTO WO SURU? ORETACHI TEMĒ NI NANI MO SHITENĒ DARO!) (ENGLISH': "AKIHIKO",
    'AKIHIKO\xa0待て! 助けてくれ! お前に何でもやる! 何が欲しい?\xa0(ROMANIZED MATE! TASUKETE KURE! OMAE NI NAN DEMO YARU! NANI GA HOSĪ? (ENGLISH': "AKIHIKO",    
    '(OLD) STEVE ROGERS': "CAPTAIN AMERICA",
    'ALEXANDER PIERCE (2012)': "ALEXANDER PIERCE",
    'BRUCE': "BRUCE BANNER",
    'BRUCE (MEMORY)': "BRUCE BANNER",
    'BRUCE BANNER (2023)': "BRUCE BANNER",
    "BRUCE BANNER\xa0I DON'T KNOW WHY EVERYONE BELIEVES THAT, BUT THAT ISN'T TRUE. THINK ABOUT IT": "BRUCE BANNER",
    'BRUCE\xa0BANNER': "BRUCE BANNER",
    'GAMORA (2014)': "GAMORA",
    'CAPTAIN STEVENS (STEVE ROGERS)' : 'CAPTAIN AMERICA',
    'STEVE': 'CAPTAIN AMERICA',
    'STEVE ROGERS': 'CAPTAIN AMERICA',
    'STEVE ROGERS (2012)': 'CAPTAIN AMERICA',
    'CLINT': 'HAWKEYE',
    'CLINT BARTON': 'HAWKEYE',
    'CLINT BARTON (2012)' : 'HAWKEYE',
    'CLINT BARTON (2023)': 'HAWKEYE',
    'EBONY MAW (2014)': 'EBONY MAW',
    'F.R.I.D.A.Y': "FRIDAY",
    'F.R.I.D.A.Y.': "FRIDAY",
    'FRIGGA (2013)': "FRIGGA",
    'HANK\xa0PYM': "HANK PYM",
    'HULK (2012)': "HULK",
    'LOKI (2012)': "LOKI",
    'NATASHA': "BLACK WIDOW",
    'NATASHA (MEMORY)': "BLACK WIDOW",
    'NATASHA ROMANOFF': "BLACK WIDOW",
    'NATASHA ROMANOFF (2012)': "BLACK WIDOW",
    'NATASHA ROMANOFF\xa0(2012)': "BLACK WIDOW",
    "NATASHA ROMANOFF\xa0FOR THE LAST FIVE YEARS I'VE BEEN TRYING TO DO ONE THING": "BLACK WIDOW",
    'NATASHA ROMANONFF': "BLACK WIDOW",
    'NEBULA (MEMORY)': "NEBULA",
    'NEBULA (2014)': "NEBULA",
    'PETER PARKER': "SPIDERMAN",
    'PETER QUILL': "STAR-LORD",
    'QUILL': "STAR-LORD",
    'RED SKULL (2014)': "RED SKULL",
    'RHODEY (MEMORY)': "RHODEY",
    "ROCKET\xa0ALL RIGHT. HERE'S THE DEAL, TUBBY": "ROCKET",
    'SAM': "FALCON",
    'SAM WILSON': "FALCON",
    'SITWELL (2012)': "SITWELL",
    "T'CHALLA": "BLACK PANTHER",
    'TEEN SCOTT LANG': "ANTMAN",
    'OLD SCOTT LANG': "ANTMAN",
    'SCOTT LANG': "ANTMAN",
    'THANOS (2014)': "THANOS",
    'THANOS (MEMORY)': "THANOS",
    'THANOS\xa0(2014)': "THANOS",
    'THOR (2012)': "THOR",
    'TONY STARK (2012)': 'IRON MAN',
    'TONY STARK': 'IRON MAN',
    'TONY STARK (2012)': 'IRON MAN',
    'TONY STARK (2023)': 'IRON MAN',
    'TONY STARK (HOLOGRAM)': 'IRON MAN',
    'TONY STARK (VOICEOVER)': 'IRON MAN',
    'TONY STARK(2012)': 'IRON MAN',
    'TONY STARK\xa0WE GOT A SHOT AT GETTING THESE STONES, BUT I GOTTA TELL YOU MY PRIORITIES': 'IRON MAN',
    'TONY STARK\xa0WHY THE LONG FACE? LET ME GUESS': 'IRON MAN'
}

In [39]:
# 캐릭터명 아닌 것 정리
df.Source = df.Source.apply(lambda x: re.sub(r"\[[\w\W]+", "", x))
df.Target = df.Target.apply(lambda x: re.sub(r"\[[\w\W]+", "", x))

# 캐릭터 명 통일
df.Source = df.Source.apply(lambda x: x.strip())
df.Target = df.Target.apply(lambda x: x.strip())

for index, row in df.iterrows():
    if row.Source in synonyms:
        row.Source = synonyms[row.Source]
        
    if row.Target in synonyms:
        row.Target = synonyms[row.Target]

In [40]:
set(list(df.Source.unique()) + list(df.Target.unique()))

{'',
 'AKIHIKO',
 'ALEXANDER PIERCE',
 'ANTMAN',
 'ASGARDIAN SOLDIER (2013)',
 'BLACK PANTHER',
 'BLACK WIDOW',
 'BOY',
 'BRUCE BANNER',
 'BRUCE ROGERS',
 'BUCKY BARNES',
 'CAPTAIN AMERICA',
 'CAROL DANVERS',
 'CASSIE LANG',
 'COOPER BARTON',
 'CORVUS GLAIVE',
 'DOCTOR STRANGE',
 'DRAX',
 'EBONY MAW',
 'EDWIN JARVIS',
 'ELEVATOR LADY',
 'EVERYONE',
 'FALCON',
 'FRIDAY',
 'FRIGGA',
 'GAMORA',
 'GIRL',
 'GROOT',
 'HANK PYM',
 'HAPPY',
 'HAWKEYE',
 'HOPE VAN DYNE',
 'HOWARD STARK',
 'HULK',
 'HYDRA AGENT',
 'IRON MAN',
 'JAMES RHODES',
 'JIM STARLIN',
 'JOE RUSSO',
 'KORG',
 'LAURA BARTON',
 'LILA BARTON',
 'LOKI',
 'MANTIS',
 'MORGAN STARK',
 'NATHANIEL BARTON',
 'NEBULA',
 'OKOYE',
 'PEPPER POTTS',
 'RED SKULL',
 'RHODEY',
 'ROCKET',
 'RONIN',
 'RUMLOW (2012)',
 'SECURITY OFFICER',
 'SITWELL',
 'SPIDERMAN',
 'STAR-LORD',
 'THANOS',
 'THE ANCIENT ONE (2012)',
 'THE WASP',
 'THOR',
 'VALKYRIE',
 'WANDA MAXIMOFF',
 'WONG'}

# 추가 전처리

In [41]:
avengers_4 = preprocess(df=df, series_no=4)

print(avengers_4.shape)
avengers_4.head()

(147, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,Source,Target,Type,weight,series
0,IRON MAN,ANTMAN,Undirected,49,4
1,IRON MAN,HOWARD STARK,Undirected,45,4
2,FRIGGA,THOR,Undirected,35,4
3,BRUCE BANNER,CAPTAIN AMERICA,Undirected,31,4
4,BRUCE BANNER,ANTMAN,Undirected,29,4


In [42]:
avengers_4[avengers_4.Source == "THOR"]

Unnamed: 0,Source,Target,Type,weight,series
8,THOR,IRON MAN,Undirected,22,4
14,THOR,STAR-LORD,Undirected,13,4
15,THOR,KORG,Undirected,13,4
26,THOR,HAWKEYE,Undirected,8,4
34,THOR,CAPTAIN AMERICA,Undirected,6,4
41,THOR,ANTMAN,Undirected,5,4
98,THOR,DRAX,Undirected,1,4
111,THOR,SPIDERMAN,Undirected,1,4
131,THOR,EVERYONE,Undirected,1,4


In [43]:
avengers_4[avengers_4.Target == "THOR"]

Unnamed: 0,Source,Target,Type,weight,series
2,FRIGGA,THOR,Undirected,35,4
10,BRUCE BANNER,THOR,Undirected,17,4
21,VALKYRIE,THOR,Undirected,9,4
42,ALEXANDER PIERCE,THOR,Undirected,5,4
56,RHODEY,THOR,Undirected,3,4
79,HULK,THOR,Undirected,2,4
123,CAROL DANVERS,THOR,Undirected,1,4
134,BRUCE ROGERS,THOR,Undirected,1,4
138,ASGARDIAN SOLDIER (2013),THOR,Undirected,1,4
145,LOKI,THOR,Undirected,1,4


# 데이터 저장

In [44]:
avengers_1.to_csv("./Data/avengers1.csv", index=False)
avengers_2.to_csv("./Data/avengers2.csv", index=False)
avengers_3.to_csv("./Data/avengers3.csv", index=False)
avengers_4.to_csv("./Data/avengers4.csv", index=False)