### 데이터셋: 유엔총회 일반토의

#### 1. 데이터 가공하기

In [5]:
# 데이터 불러오기
import pandas as pd

speaker_df = pd.read_csv('./data/Speakers_by_session.csv')

print(len(speaker_df))
print(speaker_df.columns)

7704
Index(['Year', 'Session', 'ISO Code', 'Country', 'Name of Person Speaking',
       'Post', 'Language', 'Notes'],
      dtype='object')


In [6]:
column_mapping = {
    'Year': 'year',
    'Session': 'session',
    'ISO Code': 'country',
    'Country': 'country_name',
    'Name of Person Speaking': 'speaker',
    'Post': 'position',
    'Language': 'language',
    'Notes': 'notes'
}

# select the new columns, i.e. not mapped to None
columns = [c for c in column_mapping.keys() if column_mapping[c] != None]

# select and rename columns
speaker_df = speaker_df[columns].rename(columns=column_mapping)

##### There is a wrong country code in one session:

In [7]:
# bug in data: SLV is not Slovenia
speaker_df[(speaker_df['session'] == 59) & (speaker_df['country_name'] == 'Slovenia')]

Unnamed: 0,year,session,country,country_name,speaker,position,language,notes
2269,2004.0,59.0,SLV,Slovenia,Mr. Janez Drnovšek,President,Slovenian,Spoke in Slovene; English text provided by the...


In [9]:
speaker_df.loc[((speaker_df['session'] == 59) & (speaker_df['country_name'] == 'Slovenia')), 'country'] = 'SVN'
speaker_df[(speaker_df['session'] == 59) & (speaker_df['country_name'] == 'Slovenia')]

Unnamed: 0,year,session,country,country_name,speaker,position,language,notes
2269,2004.0,59.0,SVN,Slovenia,Mr. Janez Drnovšek,President,Slovenian,Spoke in Slovene; English text provided by the...


##### Create index on session and country for join with speeches data frame:

In [10]:
# verify_integrity=True : 중복 인덱스가 있는지 확인하여 오류를 발생시키는 구문
speaker_df.dropna(subset=['session', 'country'], inplace=True)
speaker_df.set_index(['session', 'country'], inplace=True, verify_integrity=True)

#### 2. Speeches Data 전처리

In [11]:
df = pd.read_csv('./data/un-general-debates-blueprint.csv')
print(len(df))

7507


##### pycountry 사용하여 표준화된 국가명 생성하기

In [26]:
pip install pycountry

Collecting pycountry
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
     --------------------------------------- 10.1/10.1 MB 43.1 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: pycountry
  Building wheel for pycountry (pyproject.toml): started
  Building wheel for pycountry (pyproject.toml): finished with status 'done'
  Created wheel for pycountry: filename=pycountry-22.3.5-py2.py3-none-any.whl size=10681896 sha256=9bd45e490114a98fa8b5aebe76f1b65d6650c67fe26a3999457ec8de72b0031d
  Stored in directory: c:\users\lg\appdata\local\pip\cache\wheels\47\15\92\e6dc85fcb0686c82e1edbcfdf80cfe4808c058813fed0baa8f
Successfully built pycountry
Installing collect

You should consider upgrading via the 'c:\Users\LG\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [12]:
import pycountry

def country_for_iso(iso):
    if iso == 'YUG':
        return 'Yugoslavia'
    elif iso == 'CSK':
        return 'Czechoslovakia'
    elif iso == 'DDR':
        return 'German Dem. Republic'
    elif iso == 'YDYE':
        return 'Democratic Yemen'
    elif iso == 'EU':
        return 'European Union'
    else:
        return pycountry.countries.get(alpha_3=iso).name

In [13]:
# there are two codes for Yemen
# df.loc[df['country']=='YDYE', 'country'] = 'YEM'

df['country_name'] = df['country'].apply(country_for_iso)

In [14]:
df.set_index(['session', 'country'], inplace=True, verify_integrity=True)
df.reset_index(inplace=True)

#### 3. speech와 speaker 데이터 합치기

In [15]:
df = df.join(speaker_df, on=['session', 'country'], rsuffix='_spk')
df.columns
len(df)

7507

In [17]:
df = df[['session', 'year', 'country', 'country_name', 'speaker', 'position', 'text']]

df['session'] = df['session'].astype(int)
df['year'] = df['year'].astype(int)

df.sort_values(by=['year', 'session', 'country'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['session'] = df['session'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = df['year'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sort_values(by=['year', 'session', 'country'], inplace=True)


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7507 entries, 0 to 7506
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   session       7507 non-null   int32 
 1   year          7507 non-null   int32 
 2   country       7507 non-null   object
 3   country_name  7507 non-null   object
 4   speaker       7480 non-null   object
 5   position      4502 non-null   object
 6   text          7507 non-null   object
dtypes: int32(2), object(5)
memory usage: 352.0+ KB


#### 4. 저장하기

In [None]:
df.to_csv("un-general-debates-blueprint.csv.gz", index=False)